From be1b054b050f5bd3e87ecc18926fa765910a57f8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Jan 2024 16:23:12 +0100 Subject: [PATCH 01/52] Compute chunk size based on the input data size ant the number of indexing threads --- milli/src/update/facet/bulk.rs | 17 +- milli/src/update/facet/incremental.rs | 14 +- milli/src/update/facet/mod.rs | 200 +++--- .../extract/extract_docid_word_positions.rs | 8 +- .../extract/extract_facet_string_docids.rs | 70 ++- .../src/update/index_documents/extract/mod.rs | 372 +++++------ .../index_documents/helpers/grenad_helpers.rs | 84 --- .../helpers/merge_functions.rs | 79 ++- .../src/update/index_documents/helpers/mod.rs | 8 +- milli/src/update/index_documents/mod.rs | 333 +++++----- .../src/update/index_documents/typed_chunk.rs | 588 ++++++++++++------ milli/src/update/word_prefix_docids.rs | 5 +- .../src/update/words_prefix_integer_docids.rs | 8 +- 13 files changed, 991 insertions(+), 795 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 3bd4cf5f5..8771cb6fe 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,7 +1,7 @@ use std::fs::File; use std::io::BufReader; -use grenad::CompressionType; +use grenad::{CompressionType, Merger}; use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -14,6 +14,7 @@ use crate::heed_codec::facet::{ use crate::heed_codec::BytesRefCodec; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; +use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases @@ -28,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> { facet_type: FacetType, field_ids: Vec, // None if level 0 does not need to be updated - delta_data: Option>>, + delta_data: Option, MergeFn>>, } impl<'i> FacetsUpdateBulk<'i> { @@ -36,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, field_ids: Vec, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { @@ -89,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, - pub delta_data: Option>, + pub delta_data: Option>, pub group_size: u8, pub min_level_size: u8, } @@ -129,8 +130,8 @@ impl FacetsUpdateBulkInner { if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); let mut database = self.db.iter_mut(wtxn)?.remap_types::(); - let mut cursor = delta_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = delta_data.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { if !valid_lmdb_key(key) { continue; } @@ -154,8 +155,8 @@ impl FacetsUpdateBulkInner { let mut buffer = Vec::new(); let database = self.db.remap_types::(); - let mut cursor = delta_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = delta_data.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? 
{ if !valid_lmdb_key(key) { continue; } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 78db218e3..722ccb1cb 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,6 +1,7 @@ use std::fs::File; use std::io::BufReader; +use grenad::Merger; use heed::types::{Bytes, DecodeIgnore}; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use obkv::KvReader; @@ -14,6 +15,7 @@ use crate::heed_codec::BytesRefCodec; use crate::search::facet::get_highest_level; use crate::update::del_add::DelAdd; use crate::update::index_documents::valid_lmdb_key; +use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, Index, Result}; enum InsertionResult { @@ -31,14 +33,14 @@ enum DeletionResult { /// `facet_id_(string/f64)_docids` databases. pub struct FacetsUpdateIncremental { inner: FacetsUpdateIncrementalInner, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, } impl FacetsUpdateIncremental { pub fn new( index: &Index, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, group_size: u8, min_level_size: u8, max_group_size: u8, @@ -61,16 +63,18 @@ impl FacetsUpdateIncremental { } } + #[logging_timer::time("FacetsUpdateIncremental::{}")] pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { - let mut cursor = self.delta_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = self.delta_data.into_stream_merger_iter()?; + + while let Some((key, value)) = iter.next()? { if !valid_lmdb_key(key) { continue; } + let key = FacetGroupKeyCodec::::bytes_decode(key) .map_err(heed::Error::Encoding)?; let value = KvReader::new(value); - let docids_to_delete = value .get(DelAdd::Deletion) .map(CboRoaringBitmapCodec::bytes_decode) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 400507c97..ed451c7ce 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -79,12 +79,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use std::collections::BTreeSet; use std::fs::File; use std::io::BufReader; -use std::iter::FromIterator; -use charabia::normalizer::{Normalize, NormalizerOption}; -use grenad::{CompressionType, SortAlgorithm}; -use heed::types::{Bytes, DecodeIgnore, SerdeJson}; -use heed::BytesEncode; +use grenad::Merger; +use heed::types::{Bytes, DecodeIgnore}; use time::OffsetDateTime; use tracing::debug; @@ -93,9 +90,9 @@ use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::BytesRefCodec; -use crate::update::index_documents::create_sorter; -use crate::update::merge_btreeset_string; -use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; +use crate::update::MergeFn; +use crate::{try_split_array_at, FieldId, Index, Result}; pub mod bulk; pub mod incremental; @@ -108,16 +105,20 @@ pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, + normalized_delta_data: Option, MergeFn>>, group_size: u8, max_group_size: u8, min_level_size: u8, + data_size: u64, } impl<'i> FacetsUpdate<'i> { pub fn new( index: &'i Index, facet_type: FacetType, - delta_data: grenad::Reader>, + delta_data: Merger, MergeFn>, + normalized_delta_data: Option, MergeFn>>, + data_size: u64, ) -> Self { let database = match 
facet_type { FacetType::String => { @@ -135,18 +136,20 @@ impl<'i> FacetsUpdate<'i> { min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, delta_data, + normalized_delta_data, + data_size, } } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - if self.delta_data.is_empty() { + if self.data_size == 0 { return Ok(()); } debug!("Computing and writing the facet values levels docids into LMDB on disk..."); self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // See self::comparison_bench::benchmark_facet_indexing - if self.delta_data.len() >= (self.database.len(wtxn)? / 50) { + if self.data_size >= (self.database.len(wtxn)? / 50) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( @@ -170,92 +173,94 @@ impl<'i> FacetsUpdate<'i> { incremental_update.execute(wtxn)?; } - // We clear the list of normalized-for-search facets - // and the previous FSTs to compute everything from scratch - self.index.facet_id_normalized_string_strings.clear(wtxn)?; - self.index.facet_id_string_fst.clear(wtxn)?; + if let Some(normalized_delta_data) = self.normalized_delta_data { + let mut iter = normalized_delta_data.into_stream_merger_iter()?; + while let Some((key_bytes, delta_bytes)) = iter.next()? { + let deladd_reader = KvReaderDelAdd::new(delta_bytes); - // As we can't use the same write transaction to read and write in two different databases - // we must create a temporary sorter that we will write into LMDB afterward. - // As multiple unnormalized facet values can become the same normalized facet value - // we must merge them together. - let mut sorter = create_sorter( - SortAlgorithm::Unstable, - merge_btreeset_string, - CompressionType::None, - None, - None, - None, - ); + let database_set = self + .index + .facet_id_normalized_string_strings + .remap_key_type::() + .get(wtxn, &key_bytes)? + .unwrap_or_default(); - // We iterate on the list of original, semi-normalized, facet values - // and normalize them for search, inserting them in LMDB in any given order. - let options = NormalizerOption { lossy: true, ..Default::default() }; - let database = self.index.facet_id_string_docids.remap_data_type::(); - for result in database.iter(wtxn)? { - let (facet_group_key, ()) = result?; - if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key { - let mut normalized_facet = left_bound.normalize(&options); - let normalized_truncated_facet: String; - if normalized_facet.len() > MAX_FACET_VALUE_LENGTH { - normalized_truncated_facet = normalized_facet - .char_indices() - .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); - normalized_facet = normalized_truncated_facet.into(); + let add_set = deladd_reader + .get(DelAdd::Addition) + .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) + .unwrap_or_default(); + + let del_set = match deladd_reader + .get(DelAdd::Deletion) + .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) + { + Some(del_set) => { + let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + let mut set = BTreeSet::new(); + for facet in del_set { + let key = + FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() }; + // Check if the referenced value doesn't exist anymore before deleting it. 
+ if self.index.facet_id_string_docids.get(wtxn, &key)?.is_none() { + set.insert(facet); + } + } + set + } + None => BTreeSet::new(), + }; + + let set: BTreeSet<_> = + database_set.difference(&del_set).chain(add_set.iter()).cloned().collect(); + + if set.is_empty() { + self.index + .facet_id_normalized_string_strings + .remap_key_type::() + .delete(wtxn, key_bytes)?; + } else { + self.index + .facet_id_normalized_string_strings + .remap_key_type::() + .put(wtxn, key_bytes, &set)?; } - let set = BTreeSet::from_iter(std::iter::once(left_bound)); - let key = (field_id, normalized_facet.as_ref()); - let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; - let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; - sorter.insert(key, val)?; + } + + // We clear the FST of normalized-for-search to compute everything from scratch. + self.index.facet_id_string_fst.clear(wtxn)?; + // We compute one FST by string facet + let mut text_fsts = vec![]; + let mut current_fst: Option<(u16, fst::SetBuilder>)> = None; + let database = + self.index.facet_id_normalized_string_strings.remap_data_type::(); + for result in database.iter(wtxn)? { + let ((field_id, normalized_facet), _) = result?; + current_fst = match current_fst.take() { + Some((fid, fst_builder)) if fid != field_id => { + let fst = fst_builder.into_set(); + text_fsts.push((fid, fst)); + Some((field_id, fst::SetBuilder::memory())) + } + Some((field_id, fst_builder)) => Some((field_id, fst_builder)), + None => Some((field_id, fst::SetBuilder::memory())), + }; + + if let Some((_, fst_builder)) = current_fst.as_mut() { + fst_builder.insert(normalized_facet)?; + } + } + + if let Some((field_id, fst_builder)) = current_fst { + let fst = fst_builder.into_set(); + text_fsts.push((field_id, fst)); + } + + // We write those FSTs in LMDB now + for (field_id, fst) in text_fsts { + self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; } } - - // In this loop we don't need to take care of merging bitmaps - // as the grenad sorter already merged them for us. - let mut merger_iter = sorter.into_stream_merger_iter()?; - while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? { - self.index.facet_id_normalized_string_strings.remap_types::().put( - wtxn, - key_bytes, - btreeset_bytes, - )?; - } - - // We compute one FST by string facet - let mut text_fsts = vec![]; - let mut current_fst: Option<(u16, fst::SetBuilder>)> = None; - let database = - self.index.facet_id_normalized_string_strings.remap_data_type::(); - for result in database.iter(wtxn)? 
{ - let ((field_id, normalized_facet), _) = result?; - current_fst = match current_fst.take() { - Some((fid, fst_builder)) if fid != field_id => { - let fst = fst_builder.into_set(); - text_fsts.push((fid, fst)); - Some((field_id, fst::SetBuilder::memory())) - } - Some((field_id, fst_builder)) => Some((field_id, fst_builder)), - None => Some((field_id, fst::SetBuilder::memory())), - }; - - if let Some((_, fst_builder)) = current_fst.as_mut() { - fst_builder.insert(normalized_facet)?; - } - } - - if let Some((field_id, fst_builder)) = current_fst { - let fst = fst_builder.into_set(); - text_fsts.push((field_id, fst)); - } - - // We write those FSTs in LMDB now - for (field_id, fst) in text_fsts { - self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; - } - Ok(()) } } @@ -268,6 +273,7 @@ pub(crate) mod test_helpers { use std::marker::PhantomData; use std::rc::Rc; + use grenad::MergerBuilder; use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -280,7 +286,8 @@ pub(crate) mod test_helpers { use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; - use crate::update::FacetsUpdateIncrementalInner; + use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps; + use crate::update::{FacetsUpdateIncrementalInner, MergeFn}; use crate::CboRoaringBitmapCodec; /// Utility function to generate a string whose position in a lexicographically @@ -463,10 +470,13 @@ pub(crate) mod test_helpers { } writer.finish().unwrap(); let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + builder.push(reader.into_cursor().unwrap()); + let merger = builder.build(); let update = FacetsUpdateBulkInner { db: self.content, - delta_data: Some(reader), + delta_data: Some(merger), group_size: self.group_size.get(), min_level_size: self.min_level_size.get(), }; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index d568154b2..dc4886f00 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -26,7 +26,7 @@ pub fn extract_docid_word_positions( obkv_documents: grenad::Reader, indexer: GrenadParameters, searchable_fields: &Option>, - stop_words: Option<&fst::Set<&[u8]>>, + stop_words: Option<&fst::Set>>, allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, @@ -181,11 +181,11 @@ fn searchable_fields_changed( /// Factorize tokenizer building. 
fn tokenizer_builder<'a>( - stop_words: Option<&'a fst::Set<&[u8]>>, + stop_words: Option<&'a fst::Set>>, allowed_separators: Option<&'a [&str]>, dictionary: Option<&'a [&str]>, script_language: Option<&'a HashMap>>, -) -> TokenizerBuilder<'a, &'a [u8]> { +) -> TokenizerBuilder<'a, Vec> { let mut tokenizer_builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); @@ -211,7 +211,7 @@ fn lang_safe_tokens_from_document<'a>( obkv: &KvReader, searchable_fields: &Option>, tokenizer: &Tokenizer, - stop_words: Option<&fst::Set<&[u8]>>, + stop_words: Option<&fst::Set>>, allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: u32, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index d14be7464..8fdd11ee7 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,15 +1,21 @@ +use std::collections::BTreeSet; use std::fs::File; use std::io::BufReader; +use std::iter::FromIterator; use std::{io, str}; +use charabia::normalizer::{Normalize, NormalizerOption}; +use heed::types::SerdeJson; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; -use crate::heed_codec::StrRefCodec; -use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; -use crate::{FieldId, Result}; +use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; +use crate::update::index_documents::helpers::{ + merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, +}; +use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. /// @@ -19,10 +25,11 @@ use crate::{FieldId, Result}; pub fn extract_facet_string_docids( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, -) -> Result>> { +) -> Result<(grenad::Reader>, grenad::Reader>)> { puffin::profile_function!(); let max_memory = indexer.max_memory_by_thread(); + let options = NormalizerOption { lossy: true, ..Default::default() }; let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, @@ -30,12 +37,30 @@ pub fn extract_facet_string_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|m| m / 2), + ); + + let mut normalized_facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_deladd_btreeset_string, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), ); let mut buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { + let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); + + // nothing to do if we delete and re-add the value. 
+ if deladd_reader.get(DelAdd::Deletion).is_some() + && deladd_reader.get(DelAdd::Addition).is_some() + { + continue; + } + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); @@ -44,17 +69,46 @@ pub fn extract_facet_string_docids( let document_id = u32::from_be_bytes(document_id_bytes); let normalized_value = str::from_utf8(normalized_value_bytes)?; + + // Facet search normalization + { + let mut hyper_normalized_value = normalized_value.normalize(&options); + let normalized_truncated_facet: String; + if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH { + normalized_truncated_facet = hyper_normalized_value + .char_indices() + .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + hyper_normalized_value = normalized_truncated_facet.into(); + } + let set = BTreeSet::from_iter(std::iter::once(normalized_value)); + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in deladd_reader.iter() { + let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; + obkv.insert(deladd_key, val)?; + } + obkv.finish()?; + + let key = (field_id, hyper_normalized_value.as_ref()); + let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; + } + let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); buffer.clear(); let mut obkv = KvWriterDelAdd::new(&mut buffer); - for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { + for (deladd_key, _) in deladd_reader.iter() { obkv.insert(deladd_key, document_id.to_ne_bytes())?; } obkv.finish()?; facet_string_docids_sorter.insert(&key_bytes, &buffer)?; } - sorter_into_reader(facet_string_docids_sorter, indexer) + let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?; + sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized)) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 44f54ff26..b8ff00125 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -15,7 +15,6 @@ use std::io::BufReader; use crossbeam_channel::Sender; use rayon::prelude::*; -use tracing::debug; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -29,10 +28,7 @@ use self::extract_vector_points::{ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; -use super::helpers::{ - as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, - MergeFn, MergeableReader, -}; +use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfigs; @@ -52,7 +48,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, field_id_map: FieldsIdsMap, - stop_words: Option>, + stop_words: Option>>, allowed_separators: Option<&[&str]>, dictionary: Option<&[&str]>, max_positions_per_attributes: Option, @@ -62,201 +58,154 @@ pub(crate) fn 
data_from_obkv_documents( ) -> Result<()> { puffin::profile_function!(); - original_obkv_chunks - .par_bridge() - .map(|original_documents_chunk| { - send_original_documents_data( - original_documents_chunk, - indexer, - lmdb_writer_sx.clone(), - field_id_map.clone(), - embedders.clone(), - ) - }) - .collect::>()?; - - #[allow(clippy::type_complexity)] - let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> = - flattened_obkv_chunks - .par_bridge() - .map(|flattened_obkv_chunks| { - send_and_extract_flattened_documents_data( - flattened_obkv_chunks, - indexer, - lmdb_writer_sx.clone(), - &searchable_fields, - &faceted_fields, - primary_key_id, - geo_fields_ids, - &stop_words, - &allowed_separators, - &dictionary, - max_positions_per_attributes, - ) - }) - .collect(); - - let ( - docid_word_positions_chunks, - ( - fid_docid_facet_numbers_chunks, - ( - fid_docid_facet_strings_chunks, - ( - facet_is_null_docids_chunks, - (facet_is_empty_docids_chunks, facet_exists_docids_chunks), - ), - ), - ), - ) = result?; - - // merge facet_exists_docids and send them as a typed chunk - { - let lmdb_writer_sx = lmdb_writer_sx.clone(); - rayon::spawn(move || { - debug!(database = "facet-id-exists-docids", "merge"); - match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { - Ok(reader) => { - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); - } - Err(e) => { - let _ = lmdb_writer_sx.send(Err(e)); - } - } - }); - } - - // merge facet_is_null_docids and send them as a typed chunk - { - let lmdb_writer_sx = lmdb_writer_sx.clone(); - rayon::spawn(move || { - debug!(database = "facet-id-is-null-docids", "merge"); - match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { - Ok(reader) => { - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); - } - Err(e) => { - let _ = lmdb_writer_sx.send(Err(e)); - } - } - }); - } - - // merge facet_is_empty_docids and send them as a typed chunk - { - let lmdb_writer_sx = lmdb_writer_sx.clone(); - rayon::spawn(move || { - debug!(database = "facet-id-is-empty-docids", "merge"); - match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { - Ok(reader) => { - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); - } - Err(e) => { - let _ = lmdb_writer_sx.send(Err(e)); - } - } - }); - } - - if proximity_precision == ProximityPrecision::ByWord { - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_pair_proximity_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::WordPairProximityDocids, - "word-pair-proximity-docids", - ); - } - - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_fid_word_count_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::FieldIdWordCountDocids, - "field-id-wordcount-docids", - ); - - spawn_extraction_task::< - _, - _, - Vec<( - grenad::Reader>, - grenad::Reader>, - grenad::Reader>, - )>, - >( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_deladd_cbo_roaring_bitmaps, - |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } + let (original_pipeline_result, 
flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join( + || { + original_obkv_chunks + .par_bridge() + .map(|original_documents_chunk| { + send_original_documents_data( + original_documents_chunk, + indexer, + lmdb_writer_sx.clone(), + field_id_map.clone(), + embedders.clone(), + ) + }) + .collect::>() + }, + || { + flattened_obkv_chunks + .par_bridge() + .map(|flattened_obkv_chunks| { + send_and_extract_flattened_documents_data( + flattened_obkv_chunks, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + primary_key_id, + geo_fields_ids, + &stop_words, + &allowed_separators, + &dictionary, + max_positions_per_attributes, + ) + }) + .map(|result| { + if let Ok(( + ref docid_word_positions_chunk, + (ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk), + )) = result + { + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_fid_word_count_docids, + TypedChunk::FieldIdWordCountDocids, + "field-id-wordcount-docids", + ); + + let exact_attributes = exact_attributes.clone(); + run_extraction_task::< + _, + _, + ( + grenad::Reader>, + grenad::Reader>, + grenad::Reader>, + ), + >( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + move |doc_word_pos, indexer| { + extract_word_docids(doc_word_pos, indexer, &exact_attributes) + }, + |( + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + )| { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } + }, + "word-docids", + ); + + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_position_docids, + TypedChunk::WordPositionDocids, + "word-position-docids", + ); + + run_extraction_task::< + _, + _, + (grenad::Reader>, grenad::Reader>), + >( + fid_docid_facet_strings_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_facet_string_docids, + TypedChunk::FieldIdFacetStringDocids, + "field-id-facet-string-docids", + ); + + run_extraction_task::<_, _, grenad::Reader>>( + fid_docid_facet_numbers_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_facet_number_docids, + TypedChunk::FieldIdFacetNumberDocids, + "field-id-facet-number-docids", + ); + + if proximity_precision == ProximityPrecision::ByWord { + run_extraction_task::<_, _, grenad::Reader>>( + docid_word_positions_chunk.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + } + } + + Ok(()) + }) + .collect::>() }, - "word-docids", ); - spawn_extraction_task::<_, _, Vec>>>( - docid_word_positions_chunks.clone(), - indexer, - lmdb_writer_sx.clone(), - extract_word_position_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::WordPositionDocids, - "word-position-docids", - ); - - spawn_extraction_task::<_, _, Vec>>>( - fid_docid_facet_strings_chunks, - indexer, - lmdb_writer_sx.clone(), - extract_facet_string_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetStringDocids, - "field-id-facet-string-docids", - ); - - spawn_extraction_task::<_, _, Vec>>>( - fid_docid_facet_numbers_chunks, - indexer, - lmdb_writer_sx, - extract_facet_number_docids, - merge_deladd_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetNumberDocids, - "field-id-facet-number-docids", - ); - - Ok(()) + original_pipeline_result.and(flattened_pipeline_result) } /// Spawn a 
new task to extract data for a specific DB using extract_fn. /// Generated grenad chunks are merged using the merge_fn. /// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// and sent into lmdb_writer_sx. -fn spawn_extraction_task( - chunks: Vec>, +fn run_extraction_task( + chunk: grenad::Reader, indexer: GrenadParameters, lmdb_writer_sx: Sender>, extract_fn: FE, - merge_fn: MergeFn, serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result + FE: Fn(grenad::Reader, GrenadParameters) -> Result + Sync + Send + 'static, - FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, - M: MergeableReader + FromParallelIterator + Send + 'static, - M::Output: Send, + FS: Fn(M) -> TypedChunk + Sync + Send + 'static, + M: Send, { let current_span = tracing::Span::current(); @@ -264,25 +213,16 @@ fn spawn_extraction_task( let child_span = tracing::trace_span!(target: "", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); - puffin::profile_scope!("extract_multiple_chunksdexing::details, ", name); - let chunks: Result = - chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect(); - let current_span = tracing::Span::current(); - - rayon::spawn(move || match chunks { - Ok(chunks) => { - let child_span = tracing::trace_span!(target: "", parent: ¤t_span, "merge_multiple_chunks"); - let _entered = child_span.enter(); - debug!(database = name, "merge"); - puffin::profile_scope!("merge_multiple_chunks", name); - let reader = chunks.merge(merge_fn, &indexer); - let _ = lmdb_writer_sx.send(reader.map(serialize_fn)); + puffin::profile_scope!("extract_multiple_chunks", name); + match extract_fn(chunk, indexer) { + Ok(chunk) => { + let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); } Err(e) => { let _ = lmdb_writer_sx.send(Err(e)); } - }) - }); + } + }) } /// Extract chunked data and send it into lmdb_writer_sx sender: @@ -340,7 +280,7 @@ fn send_original_documents_data( }); // TODO: create a custom internal error - lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); + drop(lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)))); Ok(()) } @@ -360,22 +300,13 @@ fn send_and_extract_flattened_documents_data( faceted_fields: &HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - stop_words: &Option>, + stop_words: &Option>>, allowed_separators: &Option<&[&str]>, dictionary: &Option<&[&str]>, max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, - ( - grenad::Reader, - ( - grenad::Reader, - ( - grenad::Reader>, - (grenad::Reader>, grenad::Reader>), - ), - ), - ), + (grenad::Reader, grenad::Reader), )> { let flattened_documents_chunk = flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -446,16 +377,17 @@ fn send_and_extract_flattened_documents_data( fid_docid_facet_strings_chunk.clone(), ))); - Ok(( - fid_docid_facet_numbers_chunk, - ( - fid_docid_facet_strings_chunk, - ( - fid_facet_is_null_docids_chunk, - (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), - ), - ), - )) + let _ = lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk))); + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids( + fid_facet_is_empty_docids_chunk, + ))); + + let _ = lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk))); + + Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk)) }, ); 
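A minimal sketch, with placeholder names, of the write-side pattern that replaces the `MergeableReader` machinery deleted from grenad_helpers.rs just below: extraction no longer merges per-database readers itself; instead, grenad readers of one chunk kind are pushed into a `grenad::MergerBuilder` and the merged stream is written into LMDB in a single pass. Here `same_kind_readers`, `database` and `wtxn` are assumptions, not names taken from this patch.

// Sketch only: simplified write path for a batch of same-kind chunks.
let mut builder = grenad::MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for reader in same_kind_readers {
    builder.push(reader.into_cursor()?);
}
let merger = builder.build();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
    if valid_lmdb_key(key) {
        // In milli this step goes through write_entries_into_database, which also
        // resolves the DelAdd sides against the value already stored in LMDB.
        database.put(wtxn, key, value)?;
    }
}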
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 3e63fcf77..b0e3654a9 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -90,90 +90,6 @@ pub unsafe fn as_cloneable_grenad( Ok(reader) } -pub trait MergeableReader -where - Self: Sized, -{ - type Output; - - fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; -} - -impl MergeableReader for Vec>> { - type Output = grenad::Reader>; - - fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { - let mut merger = MergerBuilder::new(merge_fn); - self.into_iter().try_for_each(|r| merger.push(r))?; - merger.finish(params) - } -} - -impl MergeableReader for Vec<(grenad::Reader>, grenad::Reader>)> { - type Output = (grenad::Reader>, grenad::Reader>); - - fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { - let mut m1 = MergerBuilder::new(merge_fn); - let mut m2 = MergerBuilder::new(merge_fn); - for (r1, r2) in self.into_iter() { - m1.push(r1)?; - m2.push(r2)?; - } - Ok((m1.finish(params)?, m2.finish(params)?)) - } -} - -impl MergeableReader - for Vec<( - grenad::Reader>, - grenad::Reader>, - grenad::Reader>, - )> -{ - type Output = ( - grenad::Reader>, - grenad::Reader>, - grenad::Reader>, - ); - - fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { - let mut m1 = MergerBuilder::new(merge_fn); - let mut m2 = MergerBuilder::new(merge_fn); - let mut m3 = MergerBuilder::new(merge_fn); - for (r1, r2, r3) in self.into_iter() { - m1.push(r1)?; - m2.push(r2)?; - m3.push(r3)?; - } - Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?)) - } -} - -struct MergerBuilder(grenad::MergerBuilder); - -impl MergerBuilder { - fn new(merge_fn: MergeFn) -> Self { - Self(grenad::MergerBuilder::new(merge_fn)) - } - - fn push(&mut self, reader: grenad::Reader) -> Result<()> { - self.0.push(reader.into_cursor()?); - Ok(()) - } - - fn finish(self, params: &GrenadParameters) -> Result>> { - let merger = self.0.build(); - let mut writer = create_writer( - params.chunk_compression_type, - params.chunk_compression_level, - tempfile::tempfile()?, - ); - merger.write_into_stream_writer(&mut writer)?; - - writer_into_reader(writer) - } -} - #[derive(Debug, Clone, Copy)] pub struct GrenadParameters { pub chunk_compression_type: CompressionType, diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index d355ead68..7f5cc5dcd 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -35,27 +35,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // TODO improve the perf by using a `#[borrow] Cow`. - let strings: BTreeSet = values - .iter() - .map(AsRef::as_ref) - .map(serde_json::from_slice::>) - .map(StdResult::unwrap) - .reduce(|mut current, new| { - for x in new { - current.insert(x); - } - current - }) - .unwrap(); - Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap())) - } -} - pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) } @@ -243,3 +222,61 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( buffer, )?) 
} + +pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // TODO improve the perf by using a `#[borrow] Cow`. + let strings: BTreeSet = values + .iter() + .map(AsRef::as_ref) + .map(serde_json::from_slice::>) + .map(StdResult::unwrap) + .reduce(|mut current, new| { + for x in new { + current.insert(x); + } + current + }) + .unwrap(); + Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap())) + } +} + +/// Do a union of BtreeSet on both sides of a DelAdd obkv +/// separately and outputs a new DelAdd with both unions. +pub fn merge_deladd_btreeset_string<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + // Retrieve the bitmaps from both sides + let mut del_set = BTreeSet::new(); + let mut add_set = BTreeSet::new(); + for value in values { + let obkv = KvReaderDelAdd::new(value); + if let Some(bytes) = obkv.get(DelAdd::Deletion) { + let set = serde_json::from_slice::>(bytes).unwrap(); + for value in set { + del_set.insert(value); + } + } + if let Some(bytes) = obkv.get(DelAdd::Addition) { + let set = serde_json::from_slice::>(bytes).unwrap(); + for value in set { + add_set.insert(value); + } + } + } + + let mut output_deladd_obkv = KvWriterDelAdd::memory(); + let del = serde_json::to_vec(&del_set).unwrap(); + output_deladd_obkv.insert(DelAdd::Deletion, &del)?; + let add = serde_json::to_vec(&add_set).unwrap(); + output_deladd_obkv.insert(DelAdd::Addition, &add)?; + output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) + } +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 1e29c0240..b60f7be7d 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -10,13 +10,13 @@ use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, - GrenadParameters, MergeableReader, + GrenadParameters, }; pub use merge_functions::{ keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, - obkvs_merge_additions_and_deletions, MergeFn, + merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, + obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, MergeFn, }; use crate::MAX_WORD_LENGTH; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 36aa94964..912ff2c2d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -5,20 +5,21 @@ mod transform; mod typed_chunk; use std::collections::{HashMap, HashSet}; -use std::io::{Cursor, Read, Seek}; +use std::io::{Read, Seek}; use std::iter::FromIterator; use std::num::NonZeroU32; use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; +use grenad::{Merger, MergerBuilder}; use heed::types::Str; use heed::Database; use rand::SeedableRng; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; -use tracing::debug_span; -use 
typed_chunk::{write_typed_chunk_into_index, TypedChunk}; +use tracing::debug; +use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; @@ -26,8 +27,7 @@ pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, - ClonableMmap, MergeFn, + merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -95,8 +95,8 @@ pub struct IndexDocumentsConfig { impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA> where - FP: Fn(UpdateIndexingStep) + Sync, - FA: Fn() -> bool + Sync, + FP: Fn(UpdateIndexingStep) + Sync + Send, + FA: Fn() -> bool + Sync + Send, { pub fn new( wtxn: &'t mut heed::RwTxn<'i>, @@ -326,9 +326,6 @@ where } }; - let original_documents = grenad::Reader::new(original_documents)?; - let flattened_documents = grenad::Reader::new(flattened_documents)?; - // create LMDB writer channel let (lmdb_writer_sx, lmdb_writer_rx): ( Sender>, @@ -367,11 +364,7 @@ where let stop_words = self.index.stop_words(self.wtxn)?; let separators = self.index.allowed_separators(self.wtxn)?; - let separators: Option> = - separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); let dictionary = self.index.dictionary(self.wtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); @@ -381,141 +374,202 @@ where max_memory: self.indexer_config.max_memory, max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. }; - let documents_chunk_size = - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB + let documents_chunk_size = match self.indexer_config.documents_chunk_size { + Some(chunk_size) => chunk_size, + None => { + let default_chunk_size = 1024 * 1024 * 4; // 4MiB + let min_chunk_size = 1024 * 512; // 512KiB + + // compute the chunk size from the number of available threads and the inputed data size. 
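// A worked example of the sizing below: with 16 indexing threads, chunk_count = 16 * 3 / 4 = 12,
// so a 600 MiB flattened file is cut into ~50 MiB chunks; a tiny file is clamped up to the
// 512 KiB minimum, and an unreadable file size falls back to the 4 MiB default.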
+ let total_size = flattened_documents.metadata().map(|m| m.len()); + let current_num_threads = pool.current_num_threads(); + // if we have more than 2 thread, create a number of chunk equal to 3/4 threads count + let chunk_count = if current_num_threads > 2 { + (current_num_threads * 3 / 4).max(2) + } else { + current_num_threads + }; + total_size + .map_or(default_chunk_size, |size| (size as usize) / chunk_count) + .max(min_chunk_size) + } + }; + + let original_documents = grenad::Reader::new(original_documents)?; + let flattened_documents = grenad::Reader::new(flattened_documents)?; + let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; let cloned_embedder = self.embedders.clone(); + let mut final_documents_ids = RoaringBitmap::new(); + let mut databases_seen = 0; + let mut word_position_docids = None; + let mut word_fid_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + let mut chunk_accumulator = ChunkAccumulator::default(); + let mut dimension = HashMap::new(); + let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); + let current_span = tracing::Span::current(); // Run extraction pipeline in parallel. pool.install(|| { - let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); + rayon::spawn(move || { + let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); let _enter = child_span.enter(); puffin::profile_scope!("extract_and_send_grenad_chunks"); - // split obkv file into several chunks - let original_chunk_iter = - grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); + // split obkv file into several chunks + let original_chunk_iter = + grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); - // split obkv file into several chunks - let flattened_chunk_iter = - grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); + // split obkv file into several chunks + let flattened_chunk_iter = + grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); - let result = original_chunk_iter.and_then(|original_chunk| { - let flattened_chunk = flattened_chunk_iter?; - // extract all databases from the chunked obkv douments - extract::data_from_obkv_documents( - original_chunk, - flattened_chunk, - pool_params, - lmdb_writer_sx.clone(), - searchable_fields, - faceted_fields, - primary_key_id, - geo_fields_ids, - field_id_map, - stop_words, - separators.as_deref(), - dictionary.as_deref(), - max_positions_per_attributes, - exact_attributes, - proximity_precision, - cloned_embedder, - ) + let separators: Option> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let dictionary: Option> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let result = original_chunk_iter.and_then(|original_chunk| { + let flattened_chunk = flattened_chunk_iter?; + // extract all databases from the chunked obkv douments + extract::data_from_obkv_documents( + original_chunk, + flattened_chunk, + pool_params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + primary_key_id, + geo_fields_ids, + field_id_map, + stop_words, + separators.as_deref(), + dictionary.as_deref(), + max_positions_per_attributes, + exact_attributes, + proximity_precision, + cloned_embedder, + ) + }); + + if let Err(e) = result { + let _ = lmdb_writer_sx.send(Err(e)); + } + + // needs to 
be dropped to avoid channel waiting lock. + drop(lmdb_writer_sx); }); - if let Err(e) = result { - let _ = lmdb_writer_sx.send(Err(e)); - } + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); - // needs to be dropped to avoid channel waiting lock. - drop(lmdb_writer_sx); - }); + loop { + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } - let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0; - let mut final_documents_ids = RoaringBitmap::new(); + match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) { + Err(status) => { + if let Some(typed_chunks) = chunk_accumulator.pop_longest() { + let (docids, is_merged_database) = + write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?; + if !docids.is_empty() { + final_documents_ids |= docids; + let documents_seen_count = final_documents_ids.len(); + (self.progress)(UpdateIndexingStep::IndexDocuments { + documents_seen: documents_seen_count as usize, + total_documents: documents_count, + }); + debug!(documents = documents_seen_count, total = documents_count, "Seen"); + } + if is_merged_database { + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + } + // If no more chunk remains in the chunk accumulator and the channel is disconected, break. + } else if status == crossbeam_channel::RecvTimeoutError::Disconnected { + break; + } + } + Ok(result) => { + let typed_chunk = match result? { + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { + let cloneable_chunk = + unsafe { as_cloneable_grenad(&word_docids_reader)? }; + let word_docids = word_docids.get_or_insert_with(|| { + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn) + }); + word_docids.push(cloneable_chunk.into_cursor()?); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + let exact_word_docids = + exact_word_docids.get_or_insert_with(|| { + MergerBuilder::new( + merge_deladd_cbo_roaring_bitmaps as MergeFn, + ) + }); + exact_word_docids.push(cloneable_chunk.into_cursor()?); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + let word_fid_docids = word_fid_docids.get_or_insert_with(|| { + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn) + }); + word_fid_docids.push(cloneable_chunk.into_cursor()?); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } + } + TypedChunk::WordPositionDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; + let word_position_docids = + word_position_docids.get_or_insert_with(|| { + MergerBuilder::new( + merge_deladd_cbo_roaring_bitmaps as MergeFn, + ) + }); + word_position_docids.push(cloneable_chunk.into_cursor()?); + TypedChunk::WordPositionDocids(chunk) + } + TypedChunk::VectorPoints { + expected_dimension, + remove_vectors, + embeddings, + manual_vectors, + embedder_name, + } => { + dimension.insert(embedder_name.clone(), expected_dimension); + TypedChunk::VectorPoints { + remove_vectors, + embeddings, + expected_dimension, + manual_vectors, + embedder_name, + } + } + otherwise => otherwise, + }; - let mut databases_seen = 0; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - - let mut word_position_docids = None; - let mut word_fid_docids = None; - let mut word_docids = None; - let mut exact_word_docids = None; - - let mut dimension = HashMap::new(); - - for result in lmdb_writer_rx { - if (self.should_abort)() { - return Err(Error::InternalError(InternalError::AbortedIndexation)); - } - - let typed_chunk = match result? { - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; - word_docids = Some(cloneable_chunk); - let cloneable_chunk = - unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; - exact_word_docids = Some(cloneable_chunk); - let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; - word_fid_docids = Some(cloneable_chunk); - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, + chunk_accumulator.insert(typed_chunk); } } - TypedChunk::WordPositionDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; - word_position_docids = Some(cloneable_chunk); - TypedChunk::WordPositionDocids(chunk) - } - TypedChunk::VectorPoints { - expected_dimension, - remove_vectors, - embeddings, - manual_vectors, - embedder_name, - } => { - dimension.insert(embedder_name.clone(), expected_dimension); - TypedChunk::VectorPoints { - remove_vectors, - embeddings, - expected_dimension, - manual_vectors, - embedder_name, - } - } - otherwise => otherwise, - }; + } - let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; - if !docids.is_empty() { - final_documents_ids |= docids; - let documents_seen_count = final_documents_ids.len(); - (self.progress)(UpdateIndexingStep::IndexDocuments { - documents_seen: documents_seen_count as usize, - total_documents: documents_count, - }); - debug_span!("Seen", documents = documents_seen_count, total = documents_count); - } - if is_merged_database { - databases_seen += 1; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - } - } + Ok(()) + })?; // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -548,10 +602,10 @@ where } self.execute_prefix_databases( - word_docids, - exact_word_docids, - word_position_docids, - word_fid_docids, + word_docids.map(MergerBuilder::build), + exact_word_docids.map(MergerBuilder::build), + word_position_docids.map(MergerBuilder::build), + word_fid_docids.map(MergerBuilder::build), )?; Ok(number_of_documents) @@ -565,10 +619,10 @@ where )] pub fn execute_prefix_databases( self, - word_docids: Option>, - exact_word_docids: Option>, - word_position_docids: Option>, - word_fid_docids: Option>, + word_docids: Option>, + exact_word_docids: Option>, + word_position_docids: Option>, + word_fid_docids: Option>, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -751,7 +805,7 @@ where )] fn execute_word_prefix_docids( txn: &mut heed::RwTxn, - reader: grenad::Reader>, + merger: Merger, word_docids_db: Database, word_prefix_docids_db: Database, indexer_config: &IndexerConfig, @@ -761,13 +815,12 @@ fn execute_word_prefix_docids( ) -> Result<()> { puffin::profile_function!(); - let cursor = reader.into_cursor()?; let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); builder.chunk_compression_type = indexer_config.chunk_compression_type; builder.chunk_compression_level = indexer_config.chunk_compression_level; builder.max_nb_chunks = indexer_config.max_nb_chunks; builder.max_memory = indexer_config.max_memory; - builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?; + builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?; Ok(()) } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index af828fee6..ef9b6707d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -5,27 +5,60 @@ use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; use charabia::{Language, Script}; -use grenad::MergerBuilder; +use grenad::{Merger, MergerBuilder}; use heed::types::Bytes; -use heed::{PutFlags, RwTxn}; +use heed::RwTxn; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, - 
valid_lmdb_key, CursorClonableMmap, + self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key, + CursorClonableMmap, }; -use super::{ClonableMmap, MergeFn}; +use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; -use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; +use crate::update::index_documents::helpers::{ + as_cloneable_grenad, keep_latest_obkv, try_split_array_at, +}; use crate::{ lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError, }; +/// This struct accumulates and group the TypedChunks +/// and is able to give the biggest accumulated group to index them all together +/// with a merger. +#[derive(Default)] +pub(crate) struct ChunkAccumulator { + inner: Vec>, +} + +impl ChunkAccumulator { + pub fn pop_longest(&mut self) -> Option> { + match self.inner.iter().max_by_key(|v| v.len()) { + Some(left) => { + let position = self.inner.iter().position(|right| left == right); + position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty()) + } + None => None, + } + } + + pub fn insert(&mut self, chunk: TypedChunk) { + match self.inner.iter().position(|right| Some(&chunk) == right.first()) { + Some(position) => { + let v = self.inner.get_mut(position).unwrap(); + v.push(chunk); + } + None => self.inner.push(vec![chunk]), + } + } +} + pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), @@ -38,7 +71,7 @@ pub(crate) enum TypedChunk { }, WordPositionDocids(grenad::Reader>), WordPairProximityDocids(grenad::Reader>), - FieldIdFacetStringDocids(grenad::Reader>), + FieldIdFacetStringDocids((grenad::Reader>, grenad::Reader>)), FieldIdFacetNumberDocids(grenad::Reader>), FieldIdFacetExistsDocids(grenad::Reader>), FieldIdFacetIsNullDocids(grenad::Reader>), @@ -54,6 +87,34 @@ pub(crate) enum TypedChunk { ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } +impl PartialEq for TypedChunk { + fn eq(&self, other: &Self) -> bool { + use TypedChunk::*; + match (self, other) { + (FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_)) + | (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_)) + | (Documents(_), Documents(_)) + | (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_)) + | (WordDocids { .. }, WordDocids { .. }) + | (WordPositionDocids(_), WordPositionDocids(_)) + | (WordPairProximityDocids(_), WordPairProximityDocids(_)) + | (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_)) + | (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_)) + | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_)) + | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_)) + | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_)) + | (GeoPoints(_), GeoPoints(_)) + | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true, + ( + VectorPoints { embedder_name: left, expected_dimension: left_dim, .. }, + VectorPoints { embedder_name: right, expected_dimension: right_dim, .. 
}, + ) => left == right && left_dim == right_dim, + _ => false, + } + } +} +impl Eq for TypedChunk {} + impl TypedChunk { pub fn to_debug_string(&self) -> String { match self { @@ -85,7 +146,7 @@ impl TypedChunk { TypedChunk::WordPairProximityDocids(grenad) => { format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::FieldIdFacetStringDocids(grenad) => { + TypedChunk::FieldIdFacetStringDocids((grenad, _)) => { format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len()) } TypedChunk::FieldIdFacetNumberDocids(grenad) => { @@ -117,23 +178,32 @@ impl TypedChunk { /// Return new documents seen. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( - typed_chunk: TypedChunk, + typed_chunks: Vec, index: &Index, wtxn: &mut RwTxn, - index_is_empty: bool, ) -> Result<(RoaringBitmap, bool)> { - puffin::profile_function!(typed_chunk.to_debug_string()); + puffin::profile_function!(typed_chunks[0].to_debug_string()); let mut is_merged_database = false; - match typed_chunk { - TypedChunk::Documents(obkv_documents_iter) => { + match typed_chunks[0] { + TypedChunk::Documents(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "documents"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::Documents(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); let mut operations: Vec = Default::default(); let mut docids = index.documents_ids(wtxn)?; - let mut cursor = obkv_documents_iter.into_cursor()?; - while let Some((key, reader)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); @@ -174,59 +244,91 @@ pub(crate) fn write_typed_chunk_into_index( external_documents_docids.apply(wtxn, operations)?; index.put_documents_ids(wtxn, &docids)?; } - TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { + TypedChunk::FieldIdWordCountDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids"); let _entered = span.enter(); - append_entries_into_database( - fid_word_count_docids_iter, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.field_id_word_count_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::WordDocids { - word_docids_reader, - exact_word_docids_reader, - word_fid_docids_reader, - } => { + TypedChunk::WordDocids { .. 
} => { let span = tracing::trace_span!(target: "indexing::write_db", "word_docids"); let _entered = span.enter(); - let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; - append_entries_into_database( - word_docids_iter.clone(), + + let mut word_docids_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut exact_word_docids_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut word_fid_docids_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } = typed_chunk + else { + unreachable!(); + }; + let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?; + let clonable_exact_word_docids = + unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; + + word_docids_builder.push(word_docids_reader.into_cursor()?); + exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?); + word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?); + fst_merger_builder.push(clonable_word_docids.into_cursor()?); + fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?); + } + + let word_docids_merger = word_docids_builder.build(); + write_entries_into_database( + word_docids_merger, &index.word_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; - let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; - append_entries_into_database( - exact_word_docids_iter.clone(), + let exact_word_docids_merger = exact_word_docids_builder.build(); + write_entries_into_database( + exact_word_docids_merger, &index.exact_word_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; - let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; - append_entries_into_database( - word_fid_docids_iter, + let word_fid_docids_merger = word_fid_docids_builder.build(); + write_entries_into_database( + word_fid_docids_merger, &index.word_fid_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; // create fst from word docids - let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; + let fst_merger = fst_merger_builder.build(); + let fst = merge_word_docids_reader_into_fst(fst_merger)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst @@ -237,98 +339,202 @@ pub(crate) fn write_typed_chunk_into_index( index.put_words_fst(wtxn, &fst)?; is_merged_database = true; } - TypedChunk::WordPositionDocids(word_position_docids_iter) => { + TypedChunk::WordPositionDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids"); let _entered = span.enter(); - append_entries_into_database( - word_position_docids_iter, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::WordPositionDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.word_position_docids, wtxn, - index_is_empty, 
deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + TypedChunk::FieldIdFacetNumberDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids"); let _entered = span.enter(); - let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut data_size = 0; + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk + else { + unreachable!(); + }; + + data_size += facet_id_number_docids.len(); + builder.push(facet_id_number_docids.into_cursor()?); + } + let merger = builder.build(); + + let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size); indexer.execute(wtxn)?; is_merged_database = true; } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + TypedChunk::FieldIdFacetStringDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids"); let _entered = span.enter(); - let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); + + let mut facet_id_string_builder = + MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + let mut normalized_facet_id_string_builder = + MergerBuilder::new(merge_deladd_btreeset_string as MergeFn); + let mut data_size = 0; + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetStringDocids(( + facet_id_string_docids, + normalized_facet_id_string_docids, + )) = typed_chunk + else { + unreachable!(); + }; + + data_size += facet_id_string_docids.len(); + facet_id_string_builder.push(facet_id_string_docids.into_cursor()?); + normalized_facet_id_string_builder + .push(normalized_facet_id_string_docids.into_cursor()?); + } + let facet_id_string_merger = facet_id_string_builder.build(); + let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build(); + + let indexer = FacetsUpdate::new( + index, + FacetType::String, + facet_id_string_merger, + Some(normalized_facet_id_string_merger), + data_size, + ); indexer.execute(wtxn)?; is_merged_database = true; } - TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { + TypedChunk::FieldIdFacetExistsDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids"); let _entered = span.enter(); - append_entries_into_database( - facet_id_exists_docids, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.facet_id_exists_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => { + TypedChunk::FieldIdFacetIsNullDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids"); let _entered = span.enter(); - append_entries_into_database( - facet_id_is_null_docids, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in 
typed_chunks { + let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.facet_id_is_null_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => { + TypedChunk::FieldIdFacetIsEmptyDocids(_) => { let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids"); let _entered = span.enter(); - append_entries_into_database( - facet_id_is_empty_docids, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.facet_id_is_empty_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { + TypedChunk::WordPairProximityDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids"); let _entered = span.enter(); - append_entries_into_database( - word_pair_proximity_docids_iter, + + let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + + write_entries_into_database( + merger, &index.word_pair_proximity_docids, wtxn, - index_is_empty, deladd_serialize_add_side, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } - TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => { + TypedChunk::FieldIdDocidFacetNumbers(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_first as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + let index_fid_docid_facet_numbers = index.field_id_docid_facet_f64s.remap_types::(); - let mut cursor = fid_docid_facet_number.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? 
{ let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { @@ -344,14 +550,25 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => { + TypedChunk::FieldIdDocidFacetStrings(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_first as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + let index_fid_docid_facet_strings = index.field_id_docid_facet_strings.remap_types::(); - let mut cursor = fid_docid_facet_string.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { let reader = KvReaderDelAdd::new(value); if valid_lmdb_key(key) { match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { @@ -367,14 +584,25 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::GeoPoints(geo_points) => { + TypedChunk::GeoPoints(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "geo_points"); let _entered = span.enter(); + + let mut builder = MergerBuilder::new(keep_first as MergeFn); + for typed_chunk in typed_chunks { + let TypedChunk::GeoPoints(chunk) = typed_chunk else { + unreachable!(); + }; + + builder.push(chunk.into_cursor()?); + } + let merger = builder.build(); + let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; - let mut cursor = geo_points.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); @@ -393,15 +621,38 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } - TypedChunk::VectorPoints { - remove_vectors, - manual_vectors, - embeddings, - expected_dimension, - embedder_name, - } => { + TypedChunk::VectorPoints { .. } => { let span = tracing::trace_span!(target: "indexing::write_db", "vector_points"); let _entered = span.enter(); + + let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); + let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); + let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut params = None; + for typed_chunk in typed_chunks { + let TypedChunk::VectorPoints { + remove_vectors, + manual_vectors, + embeddings, + expected_dimension, + embedder_name, + } = typed_chunk + else { + unreachable!(); + }; + + params = Some((expected_dimension, embedder_name)); + + remove_vectors_builder.push(remove_vectors.into_cursor()?); + manual_vectors_builder.push(manual_vectors.into_cursor()?); + if let Some(embeddings) = embeddings { + embeddings_builder.push(embeddings.into_cursor()?); + } + } + + // typed chunks has always at least 1 chunk. 
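Each arm above follows the same shape: every chunk of a given variant is pushed into a grenad::MergerBuilder configured with the merge function for that variant, the builder is turned into a Merger, and the merged stream is written to the index in a single pass. The following standalone sketch illustrates that merge-by-key idea using only the standard library; BTreeMap and the closure stand in for grenad's Merger and MergeFn, and the helper name and byte keys are made up for the illustration.

use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

// Combine several (key, value) chunks into one view, calling `merge` whenever
// the same key appears in more than one chunk, similar to a Merger built from
// several pushed readers.
fn merge_chunks<F>(
    chunks: Vec<Vec<(Vec<u8>, Vec<u8>)>>,
    merge: F,
) -> BTreeMap<Vec<u8>, Vec<u8>>
where
    F: Fn(&[u8], &[u8]) -> Vec<u8>,
{
    let mut merged = BTreeMap::new();
    for chunk in chunks {
        for (key, value) in chunk {
            match merged.entry(key) {
                Entry::Occupied(mut entry) => {
                    let combined = merge(entry.get(), &value);
                    *entry.get_mut() = combined;
                }
                Entry::Vacant(entry) => {
                    entry.insert(value);
                }
            }
        }
    }
    merged
}

fn main() {
    // Two chunks of the same variant carrying an entry for the same key.
    let chunks = vec![
        vec![(b"doc-1".to_vec(), b"a".to_vec())],
        vec![(b"doc-1".to_vec(), b"b".to_vec()), (b"doc-2".to_vec(), b"c".to_vec())],
    ];
    let merged = merge_chunks(chunks, |prev, new| [prev, new].concat());
    assert_eq!(merged[&b"doc-1".to_vec()], b"ab".to_vec());
    assert_eq!(merged[&b"doc-2".to_vec()], b"c".to_vec());
}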
+ let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; @@ -419,8 +670,9 @@ pub(crate) fn write_typed_chunk_into_index( let writers = writers?; // remove vectors for docids we want them removed - let mut cursor = remove_vectors.into_cursor()?; - while let Some((key, _)) = cursor.move_on_next()? { + let merger = remove_vectors_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, _)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); for writer in &writers { @@ -432,40 +684,39 @@ pub(crate) fn write_typed_chunk_into_index( } // add generated embeddings - if let Some(embeddings) = embeddings { - let mut cursor = embeddings.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - let data = pod_collect_to_vec(value); - // it is a code error to have embeddings and not expected_dimension - let embeddings = - crate::vector::Embeddings::from_inner(data, expected_dimension) - // code error if we somehow got the wrong dimension - .unwrap(); + let merger = embeddings_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { + let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); + let data = pod_collect_to_vec(value); + // it is a code error to have embeddings and not expected_dimension + let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension) + // code error if we somehow got the wrong dimension + .unwrap(); - if embeddings.embedding_count() > usize::from(u8::MAX) { - let external_docid = if let Ok(Some(Ok(index))) = index - .external_id_of(wtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - }; - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - external_docid, - embeddings.embedding_count(), - ))); - } - for (embedding, writer) in embeddings.iter().zip(&writers) { - writer.add_item(wtxn, docid, embedding)?; - } + if embeddings.embedding_count() > usize::from(u8::MAX) { + let external_docid = if let Ok(Some(Ok(index))) = index + .external_id_of(wtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + }; + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + external_docid, + embeddings.embedding_count(), + ))); + } + for (embedding, writer) in embeddings.iter().zip(&writers) { + writer.add_item(wtxn, docid, embedding)?; } } // perform the manual diff - let mut cursor = manual_vectors.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let merger = manual_vectors_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? 
{ // convert the key back to a u32 (4 bytes) let (left, _index) = try_split_array_at(key).unwrap(); let docid = DocumentId::from_be_bytes(left); @@ -519,26 +770,30 @@ pub(crate) fn write_typed_chunk_into_index( tracing::debug!("Finished vector chunk for {}", embedder_name); } - TypedChunk::ScriptLanguageDocids(sl_map) => { + TypedChunk::ScriptLanguageDocids(_) => { let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids"); let _entered = span.enter(); - for (key, (deletion, addition)) in sl_map { - let mut db_key_exists = false; - let final_value = match index.script_language_docids.get(wtxn, &key)? { - Some(db_values) => { - db_key_exists = true; - (db_values - deletion) | addition - } - None => addition, - }; - if final_value.is_empty() { - // If the database entry exists, delete it. - if db_key_exists { - index.script_language_docids.delete(wtxn, &key)?; + for typed_chunk in typed_chunks { + let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() }; + for (key, (deletion, addition)) in sl_map { + let mut db_key_exists = false; + let final_value = match index.script_language_docids.get(wtxn, &key)? { + Some(db_values) => { + db_key_exists = true; + (db_values - deletion) | addition + } + None => addition, + }; + + if final_value.is_empty() { + // If the database entry exists, delete it. + if db_key_exists { + index.script_language_docids.delete(wtxn, &key)?; + } + } else { + index.script_language_docids.put(wtxn, &key, &final_value)?; } - } else { - index.script_language_docids.put(wtxn, &key, &final_value)?; } } } @@ -557,13 +812,9 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { } fn merge_word_docids_reader_into_fst( - word_docids_iter: grenad::Reader>, - exact_word_docids_iter: grenad::Reader>, + merger: Merger, ) -> Result>> { - let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); - merger_builder.push(word_docids_iter.into_cursor()?); - merger_builder.push(exact_word_docids_iter.into_cursor()?); - let mut iter = merger_builder.build().into_stream_merger_iter()?; + let mut iter = merger.into_stream_merger_iter()?; let mut builder = fst::SetBuilder::memory(); while let Some((k, _)) = iter.next()? { @@ -577,10 +828,9 @@ fn merge_word_docids_reader_into_fst( /// merge_values function is used if an entry already exist in the database. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] fn write_entries_into_database( - data: grenad::Reader, + merger: Merger, database: &heed::Database, wtxn: &mut RwTxn, - index_is_empty: bool, serialize_value: FS, merge_values: FM, ) -> Result<()> @@ -589,22 +839,17 @@ where FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, { - puffin::profile_function!(format!("number of entries: {}", data.len())); - + puffin::profile_function!(); let mut buffer = Vec::new(); let database = database.remap_types::(); - let mut cursor = data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { if valid_lmdb_key(key) { buffer.clear(); - let value = if index_is_empty { - Some(serialize_value(value, &mut buffer)?) - } else { - match database.get(wtxn, key)? { - Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, - None => Some(serialize_value(value, &mut buffer)?), - } + let value = match database.get(wtxn, key)? 
{ + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), }; match value { Some(value) => database.put(wtxn, key, value)?, @@ -614,62 +859,5 @@ where } } } - - Ok(()) -} - -/// Write provided entries in database using serialize_value function. -/// merge_values function is used if an entry already exist in the database. -/// All provided entries must be ordered. -/// If the index is not empty, write_entries_into_database is called instead. -#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] -fn append_entries_into_database( - data: grenad::Reader, - database: &heed::Database, - wtxn: &mut RwTxn, - index_is_empty: bool, - serialize_value: FS, - merge_values: FM, -) -> Result<()> -where - R: io::Read + io::Seek, - FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, - FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, - K: for<'a> heed::BytesDecode<'a>, -{ - puffin::profile_function!(format!("number of entries: {}", data.len())); - - if !index_is_empty { - return write_entries_into_database( - data, - database, - wtxn, - false, - serialize_value, - merge_values, - ); - } - - let mut buffer = Vec::new(); - let mut database = database.iter_mut(wtxn)?.remap_types::(); - - let mut cursor = data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - if valid_lmdb_key(key) { - debug_assert!( - K::bytes_decode(key).is_ok(), - "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", - key.len(), - &key - ); - buffer.clear(); - let value = serialize_value(value, &mut buffer)?; - unsafe { - // safety: We do not keep a reference to anything that lives inside the database - database.put_current_with_options::(PutFlags::APPEND, key, value)? - }; - } - } - Ok(()) } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 99c6c815e..1db066058 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -47,7 +47,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { )] pub fn execute( self, - mut new_word_docids_iter: grenad::ReaderCursor, + new_word_docids: grenad::Merger, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -68,7 +68,8 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> { if !common_prefix_fst_words.is_empty() { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?; + while let Some((word, data)) = new_word_docids_iter.next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index a05eb8721..272d465fd 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -52,7 +52,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { )] pub fn execute( self, - new_word_integer_docids: grenad::Reader, + new_word_integer_docids: grenad::Merger, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -69,14 +69,14 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> { self.max_memory, ); - let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?; - if !common_prefix_fst_words.is_empty() { // We fetch all the new common prefixes between the previous and new prefix fst. let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? { + let mut new_word_integer_docids_iter = + new_word_integer_docids.into_stream_merger_iter()?; + while let Some((key, data)) = new_word_integer_docids_iter.next()? { let (word, pos) = StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?; From 7877788510c4fcff564d30e92a7ba4c2da1f467a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 8 Feb 2024 17:37:15 +0100 Subject: [PATCH 02/52] fix logs --- milli/src/update/facet/bulk.rs | 2 +- milli/src/update/facet/incremental.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 8771cb6fe..888b1c4eb 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -66,7 +66,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } - #[logging_timer::time("FacetsUpdateBulk::{}")] + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 722ccb1cb..a28aa5a47 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -63,7 +63,7 @@ impl FacetsUpdateIncremental { } } - #[logging_timer::time("FacetsUpdateIncremental::{}")] + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")] pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { let mut iter = self.delta_data.into_stream_merger_iter()?; From 7efb1cae11a89ddd4c2f6b8cf803ec51e980c7e2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 8 Feb 2024 18:21:27 +0100 Subject: [PATCH 03/52] yield in loop when the channel is not disconnected --- milli/src/update/index_documents/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 912ff2c2d..de797541d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -497,6 +497,8 @@ where // If no more chunk remains in the chunk accumulator and the channel is disconected, break. 
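This hunk (continued just below) adjusts the receiving side of the indexing loop: the main thread polls the crossbeam channel with a timeout, breaks once the senders are disconnected and no chunk remains to index, and otherwise calls rayon::yield_now() so other tasks in the pool can make progress. A minimal sketch of that drain-with-timeout-and-yield pattern, with std::sync::mpsc and std::thread::yield_now standing in for crossbeam_channel and rayon::yield_now:

use std::sync::mpsc::{self, RecvTimeoutError};
use std::thread;
use std::time::Duration;

fn main() {
    let (sender, receiver) = mpsc::channel::<u32>();

    // Stand-in for the extraction threads that produce typed chunks.
    let producer = thread::spawn(move || {
        for chunk in 0..3 {
            sender.send(chunk).unwrap();
            thread::sleep(Duration::from_millis(10));
        }
        // `sender` is dropped here, which disconnects the channel.
    });

    let mut accumulated = Vec::new();
    loop {
        match receiver.recv_timeout(Duration::from_millis(5)) {
            Ok(chunk) => accumulated.push(chunk),
            // Every sender is gone and the queue is drained: no more work will arrive.
            // (The loop in the patch also takes the state of the chunk accumulator into account.)
            Err(RecvTimeoutError::Disconnected) => break,
            // Senders are still alive but currently idle: yield instead of busy-waiting.
            Err(RecvTimeoutError::Timeout) => thread::yield_now(),
        }
    }

    producer.join().unwrap();
    assert_eq!(accumulated, vec![0, 1, 2]);
}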
} else if status == crossbeam_channel::RecvTimeoutError::Disconnected { break; + } else { + rayon::yield_now(); } } Ok(result) => { From 39c83cb3d9ff61367ef7c496061b918c38c2074a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 12 Feb 2024 09:09:41 +0100 Subject: [PATCH 04/52] fix clippy --- milli/src/update/facet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ed451c7ce..ad84e9cc3 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -182,7 +182,7 @@ impl<'i> FacetsUpdate<'i> { .index .facet_id_normalized_string_strings .remap_key_type::() - .get(wtxn, &key_bytes)? + .get(wtxn, key_bytes)? .unwrap_or_default(); let add_set = deladd_reader From 55de96f74e8e5965679f604de75290c48e8645c6 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Tue, 13 Feb 2024 14:22:10 +0100 Subject: [PATCH 05/52] Update milli/src/update/facet/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/update/facet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ad84e9cc3..ef40e3469 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -202,7 +202,7 @@ impl<'i> FacetsUpdate<'i> { let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() }; // Check if the referenced value doesn't exist anymore before deleting it. - if self.index.facet_id_string_docids.get(wtxn, &key)?.is_none() { + if self.index.facet_id_string_docids.get(wtxn, &key)?.remap_data::().is_none() { set.insert(facet); } } From e5e811e2c99ae1c81cd4a3909e713db1076818cd Mon Sep 17 00:00:00 2001 From: Many the fish Date: Tue, 13 Feb 2024 14:22:21 +0100 Subject: [PATCH 06/52] Update milli/src/update/index_documents/extract/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/update/index_documents/extract/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index b8ff00125..251a2db99 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -280,7 +280,7 @@ fn send_original_documents_data( }); // TODO: create a custom internal error - drop(lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)))); + let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))); Ok(()) } From 48026aa75c6c12257a83e40b65accbec1712e455 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 13 Feb 2024 15:14:03 +0100 Subject: [PATCH 07/52] fix PR comments --- milli/src/update/facet/mod.rs | 188 ++++++++++-------- .../helpers/merge_functions.rs | 21 -- .../src/update/index_documents/helpers/mod.rs | 8 +- milli/src/update/index_documents/mod.rs | 6 +- .../src/update/index_documents/typed_chunk.rs | 13 +- milli/src/update/mod.rs | 5 +- 6 files changed, 117 insertions(+), 124 deletions(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ef40e3469..ca5a21ce2 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -173,98 +173,110 @@ impl<'i> FacetsUpdate<'i> { incremental_update.execute(wtxn)?; } - if let Some(normalized_delta_data) = self.normalized_delta_data { - let mut iter = 
normalized_delta_data.into_stream_merger_iter()?; - while let Some((key_bytes, delta_bytes)) = iter.next()? { - let deladd_reader = KvReaderDelAdd::new(delta_bytes); - - let database_set = self - .index - .facet_id_normalized_string_strings - .remap_key_type::() - .get(wtxn, key_bytes)? - .unwrap_or_default(); - - let add_set = deladd_reader - .get(DelAdd::Addition) - .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) - .unwrap_or_default(); - - let del_set = match deladd_reader - .get(DelAdd::Deletion) - .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) - { - Some(del_set) => { - let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap(); - let field_id = FieldId::from_be_bytes(field_id_bytes); - let mut set = BTreeSet::new(); - for facet in del_set { - let key = - FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() }; - // Check if the referenced value doesn't exist anymore before deleting it. - if self.index.facet_id_string_docids.get(wtxn, &key)?.remap_data::().is_none() { - set.insert(facet); - } - } - set - } - None => BTreeSet::new(), - }; - - let set: BTreeSet<_> = - database_set.difference(&del_set).chain(add_set.iter()).cloned().collect(); - - if set.is_empty() { - self.index - .facet_id_normalized_string_strings - .remap_key_type::() - .delete(wtxn, key_bytes)?; - } else { - self.index - .facet_id_normalized_string_strings - .remap_key_type::() - .put(wtxn, key_bytes, &set)?; - } - } - - // We clear the FST of normalized-for-search to compute everything from scratch. - self.index.facet_id_string_fst.clear(wtxn)?; - // We compute one FST by string facet - let mut text_fsts = vec![]; - let mut current_fst: Option<(u16, fst::SetBuilder>)> = None; - let database = - self.index.facet_id_normalized_string_strings.remap_data_type::(); - for result in database.iter(wtxn)? { - let ((field_id, normalized_facet), _) = result?; - current_fst = match current_fst.take() { - Some((fid, fst_builder)) if fid != field_id => { - let fst = fst_builder.into_set(); - text_fsts.push((fid, fst)); - Some((field_id, fst::SetBuilder::memory())) - } - Some((field_id, fst_builder)) => Some((field_id, fst_builder)), - None => Some((field_id, fst::SetBuilder::memory())), - }; - - if let Some((_, fst_builder)) = current_fst.as_mut() { - fst_builder.insert(normalized_facet)?; - } - } - - if let Some((field_id, fst_builder)) = current_fst { - let fst = fst_builder.into_set(); - text_fsts.push((field_id, fst)); - } - - // We write those FSTs in LMDB now - for (field_id, fst) in text_fsts { - self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; - } + match self.normalized_delta_data { + Some(data) => index_facet_search(wtxn, data, self.index), + None => Ok(()), } - Ok(()) } } +fn index_facet_search( + wtxn: &mut heed::RwTxn, + normalized_delta_data: Merger, MergeFn>, + index: &Index, +) -> Result<()> { + let mut iter = normalized_delta_data.into_stream_merger_iter()?; + while let Some((key_bytes, delta_bytes)) = iter.next()? { + let deladd_reader = KvReaderDelAdd::new(delta_bytes); + + let database_set = index + .facet_id_normalized_string_strings + .remap_key_type::() + .get(wtxn, key_bytes)? 
+ .unwrap_or_default(); + + let add_set = deladd_reader + .get(DelAdd::Addition) + .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) + .unwrap_or_default(); + + let del_set = match deladd_reader + .get(DelAdd::Deletion) + .and_then(|bytes| serde_json::from_slice::>(bytes).ok()) + { + Some(del_set) => { + let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + let mut set = BTreeSet::new(); + for facet in del_set { + let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() }; + // Check if the referenced value doesn't exist anymore before deleting it. + if index + .facet_id_string_docids + .remap_data_type::() + .get(wtxn, &key)? + .is_none() + { + set.insert(facet); + } + } + set + } + None => BTreeSet::new(), + }; + + let set: BTreeSet<_> = + database_set.difference(&del_set).chain(add_set.iter()).cloned().collect(); + + if set.is_empty() { + index + .facet_id_normalized_string_strings + .remap_key_type::() + .delete(wtxn, key_bytes)?; + } else { + index + .facet_id_normalized_string_strings + .remap_key_type::() + .put(wtxn, key_bytes, &set)?; + } + } + + // We clear the FST of normalized-for-search to compute everything from scratch. + index.facet_id_string_fst.clear(wtxn)?; + // We compute one FST by string facet + let mut text_fsts = vec![]; + let mut current_fst: Option<(u16, fst::SetBuilder>)> = None; + let database = index.facet_id_normalized_string_strings.remap_data_type::(); + for result in database.iter(wtxn)? { + let ((field_id, normalized_facet), _) = result?; + current_fst = match current_fst.take() { + Some((fid, fst_builder)) if fid != field_id => { + let fst = fst_builder.into_set(); + text_fsts.push((fid, fst)); + Some((field_id, fst::SetBuilder::memory())) + } + Some((field_id, fst_builder)) => Some((field_id, fst_builder)), + None => Some((field_id, fst::SetBuilder::memory())), + }; + + if let Some((_, fst_builder)) = current_fst.as_mut() { + fst_builder.insert(normalized_facet)?; + } + } + + if let Some((field_id, fst_builder)) = current_fst { + let fst = fst_builder.into_set(); + text_fsts.push((field_id, fst)); + } + + // We write those FSTs in LMDB now + for (field_id, fst) in text_fsts { + index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; + } + + Ok(()) +} + #[cfg(test)] pub(crate) mod test_helpers { use std::cell::Cell; diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 7f5cc5dcd..a265d152f 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -223,27 +223,6 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( )?) } -pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - // TODO improve the perf by using a `#[borrow] Cow`. - let strings: BTreeSet = values - .iter() - .map(AsRef::as_ref) - .map(serde_json::from_slice::>) - .map(StdResult::unwrap) - .reduce(|mut current, new| { - for x in new { - current.insert(x); - } - current - }) - .unwrap(); - Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap())) - } -} - /// Do a union of BtreeSet on both sides of a DelAdd obkv /// separately and outputs a new DelAdd with both unions. 
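The facet-search update above reduces to set arithmetic per (field_id, normalized_facet) key: take the set currently stored in facet_id_normalized_string_strings, remove the deleted strings whose level-0 entry no longer exists, add the new ones, then rewrite the entry or delete it when the result is empty. A small sketch of that del/add application over BTreeSet<String>, detached from LMDB and the obkv encoding; the helper name and sample values are made up for the illustration.

use std::collections::BTreeSet;

// Apply the deletion and addition sides of a DelAdd entry to the set currently
// stored for one (field_id, normalized_facet) key: keep whatever is not
// deleted, then union in the additions.
fn apply_del_add(
    current: &BTreeSet<String>,
    del: &BTreeSet<String>,
    add: &BTreeSet<String>,
) -> BTreeSet<String> {
    current.difference(del).chain(add.iter()).cloned().collect()
}

fn main() {
    let current: BTreeSet<String> = ["Blue", "blue", "RED"].map(String::from).into();
    let del: BTreeSet<String> = ["RED"].map(String::from).into();
    let add: BTreeSet<String> = ["green"].map(String::from).into();

    let updated = apply_del_add(&current, &del, &add);
    assert!(updated.contains("green") && !updated.contains("RED"));

    // An empty result would mean the whole key can be removed from the database.
    assert!(!updated.is_empty());
}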
pub fn merge_deladd_btreeset_string<'a>( diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index b60f7be7d..5d8f16fae 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -13,10 +13,10 @@ pub use grenad_helpers::{ GrenadParameters, }; pub use merge_functions::{ - keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, - merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, - obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, MergeFn, + keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, + obkvs_merge_additions_and_deletions, MergeFn, }; use crate::MAX_WORD_LENGTH; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index de797541d..61ca1a024 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,9 +25,9 @@ use self::enrich::enrich_documents_batch; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, - fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, - merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn, + fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, + valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index ef9b6707d..07d77c68f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -41,7 +41,7 @@ impl ChunkAccumulator { pub fn pop_longest(&mut self) -> Option> { match self.inner.iter().max_by_key(|v| v.len()) { Some(left) => { - let position = self.inner.iter().position(|right| left == right); + let position = self.inner.iter().position(|right| left.len() == right.len()); position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty()) } None => None, @@ -49,7 +49,11 @@ impl ChunkAccumulator { } pub fn insert(&mut self, chunk: TypedChunk) { - match self.inner.iter().position(|right| Some(&chunk) == right.first()) { + match self + .inner + .iter() + .position(|right| right.first().map_or(false, |right| chunk.is_batchable_with(right))) + { Some(position) => { let v = self.inner.get_mut(position).unwrap(); v.push(chunk); @@ -87,8 +91,8 @@ pub(crate) enum TypedChunk { ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } -impl PartialEq for TypedChunk { - fn eq(&self, other: &Self) -> bool { +impl TypedChunk { + fn is_batchable_with(&self, other: &Self) -> bool { use TypedChunk::*; match (self, other) { (FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_)) @@ -113,7 +117,6 @@ impl PartialEq for TypedChunk { 
} } } -impl Eq for TypedChunk {} impl TypedChunk { pub fn to_debug_string(&self) -> String { diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 66c52a52f..195b95d1e 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -3,9 +3,8 @@ pub use self::clear_documents::ClearDocuments; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ - merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, - MergeFn, + merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId, + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn, }; pub use self::indexer_config::IndexerConfig; pub use self::settings::{validate_embedding_settings, Setting, Settings}; From 55e942cd45c91705a038a009054aa35005fb90c8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 25 Jan 2024 18:58:52 +0100 Subject: [PATCH 08/52] buggy --- Cargo.lock | 647 +---------------------------------- meilisearch-types/Cargo.toml | 2 + meilisearch/Cargo.toml | 1 + milli/Cargo.toml | 13 +- 4 files changed, 21 insertions(+), 642 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c16dfd860..dfcc879e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -622,7 +622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.4.3", + "regex-automata", "serde", ] @@ -710,16 +710,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "calendrical_calculations" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8dfe3bc6a50b4667fafdb6d9cf26731c5418c457e317d8166c972014facf9a5d" -dependencies = [ - "core_maths", - "displaydoc", -] - [[package]] name = "camino" version = "1.1.6" @@ -878,9 +868,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb924701d850fbf0331302e7f9715c04e494b4b9bebb38ac48bdd30924e1936" +checksum = "cada616ef60b20e1156dc4b0bee5306109d1b1552438d44f7044841e9e447ebc" dependencies = [ "aho-corasick", "cow-utils", @@ -888,15 +878,12 @@ dependencies = [ "deunicode", "either", "fst", - "icu", - "icu_provider", - "icu_provider_blob", "irg-kvariants", "jieba-rs", "lindera-core", "lindera-dictionary", "lindera-tokenizer", - "litemap 0.6.1", + "litemap", "once_cell", "pinyin", "serde", @@ -904,7 +891,7 @@ dependencies = [ "unicode-normalization", "wana_kana", "whatlang", - "zerovec 0.9.6", + "zerovec", ] [[package]] @@ -1091,15 +1078,6 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" -[[package]] -name = "core_maths" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b02505ccb8c50b0aa21ace0fc08c3e53adebd4e58caa18a36152803c7709a3" -dependencies = [ - "libm", -] - [[package]] name = "cow-utils" version = "0.1.2" @@ -1509,17 +1487,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "displaydoc" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - 
[[package]] name = "doc-comment" version = "0.3.3" @@ -1579,12 +1546,6 @@ dependencies = [ "serde", ] -[[package]] -name = "embedded-io" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" - [[package]] name = "encode_unicode" version = "0.3.6" @@ -1811,17 +1772,6 @@ dependencies = [ "unescaper", ] -[[package]] -name = "fixed_decimal" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5287d527037d0f35c8801880361eb38bb9bce194805350052c2a79538388faeb" -dependencies = [ - "displaydoc", - "smallvec", - "writeable", -] - [[package]] name = "flate2" version = "1.0.28" @@ -2425,487 +2375,6 @@ dependencies = [ "tokio-rustls 0.24.1", ] -[[package]] -name = "icu" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30f75f394ebee8d539bef8f6f02ad7b5f41c33de74c9eae1a50337b382a5aab1" -dependencies = [ - "icu_calendar", - "icu_casemap", - "icu_collator", - "icu_collections", - "icu_compactdecimal", - "icu_datetime", - "icu_decimal", - "icu_displaynames", - "icu_list", - "icu_locid", - "icu_locid_transform", - "icu_normalizer", - "icu_plurals", - "icu_properties", - "icu_provider", - "icu_relativetime", - "icu_segmenter", - "icu_timezone", - "icu_transliterate", -] - -[[package]] -name = "icu_calendar" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b520c5675775e3838447c33fc55bf558148c6824ef0d20ff7a9e0df7345a281c" -dependencies = [ - "calendrical_calculations", - "displaydoc", - "icu_calendar_data", - "icu_locid", - "icu_locid_transform", - "icu_provider", - "serde", - "tinystr", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_calendar_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75d8d1a514ca7e6dc547be930f2fd661d578909c07cf1c1adade81c3f7a78840" - -[[package]] -name = "icu_casemap" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976068d7759293cbd9daa0d1669618bb9094c7ee54e546cd8b877dd4fe59007a" -dependencies = [ - "displaydoc", - "icu_casemap_data", - "icu_collections", - "icu_locid", - "icu_properties", - "icu_provider", - "serde", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_casemap_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1251070c14d5b94cd00f97025e9cedce6a6eeb39485e2a226c58432cc4f72ffd" - -[[package]] -name = "icu_collator" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be493c81154545a00fc5196e814cae0e1470bc696d518b5df877049aa6bcefe1" -dependencies = [ - "displaydoc", - "icu_collator_data", - "icu_collections", - "icu_locid", - "icu_locid_transform", - "icu_normalizer", - "icu_properties", - "icu_provider", - "serde", - "smallvec", - "utf16_iter", - "utf8_iter", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_collator_data" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dbe9abe5ce570ad4707026f37bc21ef95c36b945c3c4564b9aa4e2e1c043126" - -[[package]] -name = "icu_collections" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3907b2246e8dd5a29ead8a965e7c0c8a90e9b928e614a4279257d45c5e553e91" -dependencies = [ - "displaydoc", - "serde", - "yoke", - "zerofrom", - "zerovec 0.10.0", -] - -[[package]] 
-name = "icu_compactdecimal" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a8bb9143e7681fd5f5877c76f7b6365e173545d00d0e12ef23ba1888a996baa" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_compactdecimal_data", - "icu_decimal", - "icu_locid_transform", - "icu_plurals", - "icu_provider", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_compactdecimal_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e9b7585f26db531ea5aaedaa68cb66cd2be37fe698b33a289849ff3129545b" - -[[package]] -name = "icu_datetime" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5bf2e6dd961b59ee5935070220915db6cf0ab5137de362964f800c2b7d14fa" -dependencies = [ - "displaydoc", - "either", - "fixed_decimal", - "icu_calendar", - "icu_datetime_data", - "icu_decimal", - "icu_locid", - "icu_locid_transform", - "icu_plurals", - "icu_provider", - "icu_timezone", - "litemap 0.7.1", - "serde", - "smallvec", - "tinystr", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_datetime_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078b2ed516a2f5054ee7f55b1fe970b92e90ae4cace8a0fe1e5f9fc2e94be609" - -[[package]] -name = "icu_decimal" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1986a0b7df834aaddb911b4593c990950ac5606fc83ce9aad4311be80f51e81a" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_decimal_data", - "icu_locid", - "icu_locid_transform", - "icu_provider", - "serde", - "writeable", -] - -[[package]] -name = "icu_decimal_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c064b3828953151f8c610bfff6fec776f958641249ebfd1cf36f073f0654e77" - -[[package]] -name = "icu_displaynames" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c98329d348e918ac7e88e6d6613a46bef09ca8a65db4ddf70d86e6eaac0e2ec3" -dependencies = [ - "icu_displaynames_data", - "icu_locid", - "icu_locid_transform", - "icu_provider", - "serde", - "tinystr", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_displaynames_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60f9f56c427f1e80383667e8fb13c07707f6561839283115617cc67307a5d020" - -[[package]] -name = "icu_list" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1a44bbed77a7e7b555f9d7dd4b43f75ec1402b438a901d20451943d50cbd90" -dependencies = [ - "deduplicating_array", - "displaydoc", - "icu_list_data", - "icu_locid_transform", - "icu_provider", - "regex-automata 0.2.0", - "serde", - "writeable", -] - -[[package]] -name = "icu_list_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3237583f0cb7feafabb567c4492fe9ef1d2d4113f6a8798a923273ea5de996d" - -[[package]] -name = "icu_locid" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f284eb342dc49d3e9d9f3b188489d76b5d22dfb1d1a5e0d1941811253bac625c" -dependencies = [ - "displaydoc", - "litemap 0.7.1", - "serde", - "tinystr", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_locid_transform" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6551daf80882d8e68eee186cc19e132d8bde1b1f059a79b93384a5ca0e8fc5e7" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_locid_transform_data", - "icu_provider", - "serde", - "tinystr", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_locid_transform_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a741eba5431f75eb2f1f9022d3cffabcadda6771e54fb4e77c8ba8653e4da44" - -[[package]] -name = "icu_normalizer" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080fc33a720d50a7342b0c58df010fbcfb842d6f78ef81555f8b1ac6bba57d3c" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "serde", - "smallvec", - "utf16_iter", - "utf8_iter", - "write16", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_normalizer_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8d22f74066c2e6442db2a9aa14950278e86719e811e304e48bae03094b369d" - -[[package]] -name = "icu_plurals" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20556516b8be2b2f5dc3d6b23884b65c5c59ed8be0b44c419e4808c9b0792fce" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_locid", - "icu_locid_transform", - "icu_plurals_data", - "icu_provider", - "serde", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_plurals_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc552215224997aaaa4e05d95981386d3c52042acebfcc732137d5d9be96a21" - -[[package]] -name = "icu_properties" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3477ae70f8ca8dc08ff7574b5398ed0a2f2e4e6b66bdff2558a92ed67e262be1" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locid_transform", - "icu_properties_data", - "icu_provider", - "serde", - "tinystr", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_properties_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c8bb3b67a8347e94d580434369e5c7ee89999b9309d04b7cfc88dfaa0f31b59" - -[[package]] -name = "icu_provider" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68acdef80034b5e35d8524e9817479d389a4f9774f3f0cbe1bf3884d80fd5934" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_provider_macros", - "postcard", - "serde", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_provider_blob" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31326d28c7f95a964a4f0ee86c24002da5f6db907e3bcb079949b4ff103b6a9" -dependencies = [ - "icu_provider", - "postcard", - "serde", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_provider_macros" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2060258edfcfe32ca7058849bf0f146cb5c59aadbedf480333c0d0002f97bc99" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - -[[package]] -name = "icu_relativetime" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4e6c1b531ab35f5b0cb552d3fb8dab1cb49f98e68e12bdc2169ca15e805207c" -dependencies = [ - "displaydoc", - "fixed_decimal", - "icu_decimal", - "icu_locid_transform", - "icu_plurals", - "icu_provider", - "icu_relativetime_data", - 
"serde", - "writeable", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_relativetime_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71ec2ca0aff8c6865075c6257bc91d21a77acb6465635306a280af89208bed24" - -[[package]] -name = "icu_segmenter" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcb3c1981ce2187a745f391a741cb14e77453325acb3b2e014b05da51c0a39f2" -dependencies = [ - "core_maths", - "displaydoc", - "icu_collections", - "icu_locid", - "icu_provider", - "icu_segmenter_data", - "serde", - "utf8_iter", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_segmenter_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9703f6713044d1c0a1335a6d78ffece4c9380582416ace6feeb608e84d279fc7" - -[[package]] -name = "icu_timezone" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e6401cd210ccda98b2e7fc707831b29c6efe319efbbec460f957b6f331f626" -dependencies = [ - "displaydoc", - "icu_calendar", - "icu_locid", - "icu_provider", - "icu_timezone_data", - "serde", - "tinystr", - "zerotrie", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_timezone_data" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d7e214a653bac59b768c42f82d252f13af95e8a9cb07b6108b8bc723c561b43" - -[[package]] -name = "icu_transliterate" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4bdf006774b5a5898d97af6c95b148d34cd5c87cbed00610ff873e5b5885e28" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locid", - "icu_normalizer", - "icu_properties", - "icu_provider", - "icu_unicodeset_parse", - "litemap 0.7.1", - "serde", - "zerovec 0.10.0", -] - -[[package]] -name = "icu_unicodeset_parse" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2c3c1ab072cb9ec2dfb377ed7be07bf1bdce055b8324ba6392323f588c38c5a" -dependencies = [ - "icu_collections", - "icu_properties", - "icu_provider", - "tinystr", - "zerovec 0.10.0", -] - [[package]] name = "ident_case" version = "1.0.1" @@ -3530,15 +2999,6 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "575d8a551c59104b4df91269921e5eab561aa1b77c618dac0414b5d44a4617de" -[[package]] -name = "litemap" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a1a2647d5b7134127971a6de0d533c49de2159167e7f259c427195f87168a1" -dependencies = [ - "serde", -] - [[package]] name = "lmdb-master-sys" version = "0.1.0" @@ -4461,17 +3921,6 @@ version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3bccab0e7fd7cc19f820a1c8c91720af652d0c88dc9664dd72aef2614f04af3b" -[[package]] -name = "postcard" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a55c51ee6c0db07e68448e336cf8ea4131a620edefebf9893e759b2d793420f8" -dependencies = [ - "cobs", - "embedded-io", - "serde", -] - [[package]] name = "powerfmt" version = "0.2.0" @@ -4733,19 +4182,10 @@ checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.3", + "regex-automata", "regex-syntax 0.8.2", ] -[[package]] -name = "regex-automata" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782" -dependencies = [ - "memchr", -] - [[package]] name = "regex-automata" version = "0.4.3" @@ -5229,9 +4669,6 @@ name = "smallvec" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2593d31f82ead8df961d8bd23a64c2ccf2eb5dd34b0a34bfb4dd54011c72009e" -dependencies = [ - "serde", -] [[package]] name = "smartstring" @@ -5546,17 +4983,6 @@ dependencies = [ "time-core", ] -[[package]] -name = "tinystr" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0e245e80bdc9b4e5356fc45a72184abbc3861992603f515270e9340f5a219" -dependencies = [ - "displaydoc", - "serde", - "zerovec 0.10.0", -] - [[package]] name = "tinytemplate" version = "1.2.1" @@ -5964,24 +5390,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" -[[package]] -name = "utf16_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52df8b7fb78e7910d776fccf2e42ceaf3604d55e8e7eb2dbd183cb1441d8a692" - [[package]] name = "utf8-width" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" -[[package]] -name = "utf8_iter" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a8922555b9500e3d865caed19330172cd67cbf82203f1a3311d8c305cc9f33" - [[package]] name = "utf8parse" version = "0.2.1" @@ -6455,18 +5869,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "write16" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" - -[[package]] -name = "writeable" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0af0c3d13faebf8dda0b5256fa7096a2d5ccb662f7b9f54a40fe201077ab1c2" - [[package]] name = "xattr" version = "1.0.1" @@ -6554,20 +5956,6 @@ dependencies = [ "synstructure", ] -[[package]] -name = "zerotrie" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9685bb4deb98dab812e87c296a9631fc00d7ca4bc5c2c5f304f375bbed711a8a" -dependencies = [ - "displaydoc", - "litemap 0.7.1", - "serde", - "yoke", - "zerofrom", - "zerovec 0.10.0", -] - [[package]] name = "zerovec" version = "0.9.6" @@ -6577,29 +5965,6 @@ dependencies = [ "zerofrom", ] -[[package]] -name = "zerovec" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1194130c5b155bf8ae50ab16c86ab758cd695cf9ad176d2f870b744cbdbb572e" -dependencies = [ - "serde", - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acabf549809064225ff8878baedc4ce3732ac3b07e7c7ce6e5c2ccdbc485c324" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "zip" version = "0.6.6" diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index f5bfaa036..b5460fb56 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -54,3 +54,5 @@ thai = ["milli/thai"] greek = ["milli/greek"] # allow khmer specialized tokenization khmer = ["milli/khmer"] +# allow vietnamese specialized tokenization +vietnamese = 
["milli/vietnamese"] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 7fbabba87..487013dd7 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -154,6 +154,7 @@ japanese = ["meilisearch-types/japanese"] thai = ["meilisearch-types/thai"] greek = ["meilisearch-types/greek"] khmer = ["meilisearch-types/khmer"] +vietnamese = ["meilisearch-types/vietnamese"] [package.metadata.mini-dashboard] assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 4bc05d2cc..66e25baed 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -102,7 +102,16 @@ meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] -all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"] +all-tokenizations = [ + "charabia/chinese", + "charabia/hebrew", + "charabia/japanese", + "charabia/thai", + "charabia/korean", + "charabia/greek", + "charabia/khmer", + "charabia/vietnamese", +] # Use POSIX semaphores instead of SysV semaphores in LMDB # For more information on this feature, see heed's Cargo.toml @@ -130,5 +139,7 @@ greek = ["charabia/greek"] # allow khmer specialized tokenization khmer = ["charabia/khmer"] +vietnamese = ["charabia/vietnamese"] + # allow CUDA support, see cuda = ["candle-core/cuda"] From 3b6544db6dd4f1fcd5b4908b23b8aa9c18f2325b Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 12 Feb 2024 11:06:37 +0100 Subject: [PATCH 09/52] Implement the experimental log mode cli flag --- Cargo.lock | 13 +++++ index-scheduler/src/features.rs | 2 +- meilisearch/Cargo.toml | 2 +- .../src/analytics/segment_analytics.rs | 7 ++- meilisearch/src/lib.rs | 21 +++++++-- meilisearch/src/main.rs | 45 ++++++++++++------ meilisearch/src/option.rs | 47 +++++++++++++++++++ meilisearch/src/routes/logs.rs | 29 +++++++++++- meilisearch/tests/common/server.rs | 12 ++++- meilisearch/tests/common/service.rs | 12 ++++- meilisearch/tests/logs/error.rs | 15 +++++- meilisearch/tests/logs/mod.rs | 13 +++-- 12 files changed, 187 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c16dfd860..c21474f1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5795,6 +5795,16 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.18" @@ -5802,11 +5812,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "nu-ansi-term", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] diff --git a/index-scheduler/src/features.rs b/index-scheduler/src/features.rs index 4fd5bd0e7..3be18a3f1 100644 --- a/index-scheduler/src/features.rs +++ b/index-scheduler/src/features.rs @@ -48,7 +48,7 @@ impl RoFeatures { Ok(()) } else { Err(FeatureNotEnabledError { - disabled_action: "getting logs through the `/logs/stream` route", + disabled_action: "Modifying logs through the `/logs/*` routes", feature: "logs route", issue_link: "https://github.com/orgs/meilisearch/discussions/721", } diff --git a/meilisearch/Cargo.toml 
b/meilisearch/Cargo.toml index 7fbabba87..0de87c69a 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -104,7 +104,7 @@ serde_urlencoded = "0.7.1" termcolor = "1.4.1" url = { version = "2.5.0", features = ["serde"] } tracing = "0.1.40" -tracing-subscriber = "0.3.18" +tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.9" diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index a38ddaab2..7e9fff925 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -28,7 +28,9 @@ use super::{ config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH, }; use crate::analytics::Analytics; -use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot}; +use crate::option::{ + default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, +}; use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::facet_search::FacetSearchQuery; use crate::routes::tasks::TasksFilterQuery; @@ -250,6 +252,7 @@ impl super::Analytics for SegmentAnalytics { struct Infos { env: String, experimental_enable_metrics: bool, + experimental_logs_mode: LogMode, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, @@ -288,6 +291,7 @@ impl From for Infos { let Opt { db_path, experimental_enable_metrics, + experimental_logs_mode, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, @@ -335,6 +339,7 @@ impl From for Infos { Self { env, experimental_enable_metrics, + experimental_logs_mode, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, db_path: db_path != PathBuf::from("./data.ms"), diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index c43a32cdc..01ca63857 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -97,11 +97,25 @@ pub type LogRouteType = tracing_subscriber::filter::Filtered< tracing_subscriber::Registry, >; +pub type SubscriberForSecondLayer = tracing_subscriber::layer::Layered< + tracing_subscriber::reload::Layer, + tracing_subscriber::Registry, +>; + +pub type LogStderrHandle = + tracing_subscriber::reload::Handle; + +pub type LogStderrType = tracing_subscriber::filter::Filtered< + Box + Send + Sync>, + Targets, + SubscriberForSecondLayer, +>; + pub fn create_app( index_scheduler: Data, auth_controller: Data, opt: Opt, - logs: LogRouteHandle, + logs: (LogRouteHandle, LogStderrHandle), analytics: Arc, enable_dashboard: bool, ) -> actix_web::App< @@ -444,7 +458,7 @@ pub fn configure_data( index_scheduler: Data, auth: Data, opt: &Opt, - logs: LogRouteHandle, + (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle), analytics: Arc, ) { let http_payload_size_limit = opt.http_payload_size_limit.get_bytes() as usize; @@ -452,7 +466,8 @@ pub fn configure_data( .app_data(index_scheduler) .app_data(auth) .app_data(web::Data::from(analytics)) - .app_data(web::Data::new(logs)) + .app_data(web::Data::new(logs_route)) + .app_data(web::Data::new(logs_stderr)) .app_data( web::JsonConfig::default() .limit(http_payload_size_limit) diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index ed18fb97e..231b1cc75 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -10,8 
+10,10 @@ use actix_web::HttpServer; use index_scheduler::IndexScheduler; use is_terminal::IsTerminal; use meilisearch::analytics::Analytics; +use meilisearch::option::LogMode; use meilisearch::{ - analytics, create_app, prototype_name, setup_meilisearch, LogRouteHandle, LogRouteType, Opt, + analytics, create_app, prototype_name, setup_meilisearch, LogRouteHandle, LogRouteType, + LogStderrHandle, LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; use mimalloc::MiMalloc; @@ -23,28 +25,43 @@ use tracing_subscriber::Layer; #[global_allocator] static ALLOC: MiMalloc = MiMalloc; -fn default_layer() -> LogRouteType { +fn default_log_route_layer() -> LogRouteType { None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF)) } +fn default_log_stderr_layer(opt: &Opt) -> LogStderrType { + let layer = tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); + + let layer = match opt.experimental_logs_mode { + LogMode::Human => Box::new(layer) + as Box + Send + Sync>, + LogMode::Json => Box::new(layer.json()) + as Box + Send + Sync>, + }; + + layer.with_filter( + tracing_subscriber::filter::Targets::new() + .with_target("", LevelFilter::from_str(&opt.log_level.to_string()).unwrap()), + ) +} + /// does all the setup before meilisearch is launched -fn setup(opt: &Opt) -> anyhow::Result { - let (route_layer, route_layer_handle) = tracing_subscriber::reload::Layer::new(default_layer()); +fn setup(opt: &Opt) -> anyhow::Result<(LogRouteHandle, LogStderrHandle)> { + let (route_layer, route_layer_handle) = + tracing_subscriber::reload::Layer::new(default_log_route_layer()); let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer; - let subscriber = tracing_subscriber::registry().with(route_layer).with( - tracing_subscriber::fmt::layer() - .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE) - .with_filter( - tracing_subscriber::filter::LevelFilter::from_str(&opt.log_level.to_string()) - .unwrap(), - ), - ); + let (stderr_layer, stderr_layer_handle) = + tracing_subscriber::reload::Layer::new(default_log_stderr_layer(opt)); + let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer; + + let subscriber = tracing_subscriber::registry().with(route_layer).with(stderr_layer); // set the subscriber as the default for the application tracing::subscriber::set_global_default(subscriber).unwrap(); - Ok(route_layer_handle) + Ok((route_layer_handle, stderr_layer_handle)) } fn on_panic(info: &std::panic::PanicInfo) { @@ -110,7 +127,7 @@ async fn run_http( index_scheduler: Arc, auth_controller: Arc, opt: Opt, - logs: LogRouteHandle, + logs: (LogRouteHandle, LogStderrHandle), analytics: Arc, ) -> anyhow::Result<()> { let enable_dashboard = &opt.env == "development"; diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 9586a3f6f..2c41c8065 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -51,6 +51,7 @@ const MEILI_IGNORE_MISSING_DUMP: &str = "MEILI_IGNORE_MISSING_DUMP"; const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; +const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = 
"MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = @@ -79,6 +80,39 @@ const DEFAULT_LOG_EVERY_N: usize = 100_000; pub const INDEX_SIZE: u64 = 2 * 1024 * 1024 * 1024 * 1024; // 2 TiB pub const TASK_DB_SIZE: u64 = 20 * 1024 * 1024 * 1024; // 20 GiB +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum LogMode { + #[default] + Human, + Json, +} + +impl Display for LogMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + LogMode::Human => Display::fmt("HUMAN", f), + LogMode::Json => Display::fmt("JSON", f), + } + } +} + +impl FromStr for LogMode { + type Err = LogModeError; + + fn from_str(s: &str) -> Result { + match s.trim().to_lowercase().as_str() { + "human" => Ok(LogMode::Human), + "json" => Ok(LogMode::Json), + _ => Err(LogModeError(s.to_owned())), + } + } +} + +#[derive(Debug, thiserror::Error)] +#[error("Unsupported log {0} mode level. Supported values are `HUMAN` and `JSON`.")] +pub struct LogModeError(String); + #[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] #[serde(rename_all = "UPPERCASE")] pub enum LogLevel { @@ -310,6 +344,14 @@ pub struct Opt { #[serde(default)] pub experimental_enable_metrics: bool, + /// TODO: TAMO: update link + /// Experimental logs mode feature. For more information, see: + /// + /// Change the mode of the logs on the console. + #[clap(long, env = MEILI_EXPERIMENTAL_LOGS_MODE, default_value_t)] + #[serde(default)] + pub experimental_logs_mode: LogMode, + /// Experimental logs route feature. For more information, see: /// /// Enables the log route on the `POST /logs/stream` endpoint and the `DELETE /logs/stream` to stop receiving logs. @@ -422,6 +464,7 @@ impl Opt { #[cfg(feature = "analytics")] no_analytics, experimental_enable_metrics, + experimental_logs_mode, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, } = self; @@ -479,6 +522,10 @@ impl Opt { MEILI_EXPERIMENTAL_ENABLE_METRICS, experimental_enable_metrics.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_LOGS_MODE, + experimental_logs_mode.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE, experimental_enable_logs_route.to_string(), diff --git a/meilisearch/src/routes/logs.rs b/meilisearch/src/routes/logs.rs index d95f80bb8..fe6ed21f8 100644 --- a/meilisearch/src/routes/logs.rs +++ b/meilisearch/src/routes/logs.rs @@ -22,14 +22,15 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::LogRouteHandle; +use crate::{LogRouteHandle, LogStderrHandle}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("stream") .route(web::post().to(SeqHandler(get_logs))) .route(web::delete().to(SeqHandler(cancel_logs))), - ); + ) + .service(web::resource("stderr").route(web::post().to(SeqHandler(update_stderr_target)))); } #[derive(Debug, Default, Clone, Copy, Deserr, PartialEq, Eq)] @@ -279,3 +280,27 @@ pub async fn cancel_logs( Ok(HttpResponse::NoContent().finish()) } + +#[derive(Debug, Deserr)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +pub struct UpdateStderrLogs { + #[deserr(default = "info".parse().unwrap(), try_from(&String) = MyTargets::from_str -> DeserrJsonError)] + target: MyTargets, +} + +pub async fn update_stderr_target( + index_scheduler: GuardedData, Data>, 
+ logs: Data, + body: AwebJson, +) -> Result { + index_scheduler.features().check_logs_route()?; + + let opt = body.into_inner(); + + logs.modify(|layer| { + *layer.filter_mut() = opt.target.0.clone(); + }) + .unwrap(); + + Ok(HttpResponse::NoContent().finish()) +} diff --git a/meilisearch/tests/common/server.rs b/meilisearch/tests/common/server.rs index 134124cc8..41607f76d 100644 --- a/meilisearch/tests/common/server.rs +++ b/meilisearch/tests/common/server.rs @@ -9,7 +9,7 @@ use actix_web::http::StatusCode; use byte_unit::{Byte, ByteUnit}; use clap::Parser; use meilisearch::option::{IndexerOpts, MaxMemory, Opt}; -use meilisearch::{analytics, create_app, setup_meilisearch}; +use meilisearch::{analytics, create_app, setup_meilisearch, SubscriberForSecondLayer}; use once_cell::sync::Lazy; use tempfile::TempDir; use tokio::time::sleep; @@ -87,12 +87,20 @@ impl Server { tracing_subscriber::reload::Layer::new(None.with_filter( tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF), )); + let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new( + (Box::new( + tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE), + ) + as Box + Send + Sync>) + .with_filter(tracing_subscriber::filter::Targets::new()), + ); actix_web::test::init_service(create_app( self.service.index_scheduler.clone().into(), self.service.auth.clone().into(), self.service.options.clone(), - route_layer_handle, + (route_layer_handle, stderr_layer_handle), analytics::MockAnalytics::new(&self.service.options), true, )) diff --git a/meilisearch/tests/common/service.rs b/meilisearch/tests/common/service.rs index 4c23a18d8..cd78253aa 100644 --- a/meilisearch/tests/common/service.rs +++ b/meilisearch/tests/common/service.rs @@ -5,7 +5,7 @@ use actix_web::http::StatusCode; use actix_web::test; use actix_web::test::TestRequest; use index_scheduler::IndexScheduler; -use meilisearch::{analytics, create_app, Opt}; +use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; use meilisearch_auth::AuthController; use tracing::level_filters::LevelFilter; use tracing_subscriber::Layer; @@ -111,12 +111,20 @@ impl Service { tracing_subscriber::reload::Layer::new(None.with_filter( tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF), )); + let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new( + (Box::new( + tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE), + ) + as Box + Send + Sync>) + .with_filter(tracing_subscriber::filter::Targets::new()), + ); let app = test::init_service(create_app( self.index_scheduler.clone().into(), self.auth.clone().into(), self.options.clone(), - route_layer_handle, + (route_layer_handle, stderr_layer_handle), analytics::MockAnalytics::new(&self.options), true, )) diff --git a/meilisearch/tests/logs/error.rs b/meilisearch/tests/logs/error.rs index 4f4d741e3..637c84add 100644 --- a/meilisearch/tests/logs/error.rs +++ b/meilisearch/tests/logs/error.rs @@ -162,7 +162,7 @@ async fn logs_stream_without_enabling_the_route() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", + "message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/721", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" @@ -173,7 +173,18 @@ async fn logs_stream_without_enabling_the_route() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", + "message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + let (response, code) = server.service.post("/logs/stderr", json!({})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" diff --git a/meilisearch/tests/logs/mod.rs b/meilisearch/tests/logs/mod.rs index 0002fe33c..3b36d78f8 100644 --- a/meilisearch/tests/logs/mod.rs +++ b/meilisearch/tests/logs/mod.rs @@ -5,7 +5,7 @@ use std::str::FromStr; use actix_web::http::header::ContentType; use meili_snap::snapshot; -use meilisearch::{analytics, create_app, Opt}; +use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::Layer; @@ -27,18 +27,25 @@ async fn basic_test_log_stream_route() { tracing_subscriber::reload::Layer::new(None.with_filter( tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF), )); + let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new( + (Box::new( + tracing_subscriber::fmt::layer() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE), + ) as Box + Send + Sync>) + .with_filter(tracing_subscriber::filter::Targets::new()), + ); let subscriber = tracing_subscriber::registry().with(route_layer).with( tracing_subscriber::fmt::layer() .with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE) - .with_filter(tracing_subscriber::filter::LevelFilter::from_str("INFO").unwrap()), + .with_filter(tracing_subscriber::filter::LevelFilter::from_str("OFF").unwrap()), ); let app = actix_web::test::init_service(create_app( server.service.index_scheduler.clone().into(), server.service.auth.clone().into(), server.service.options.clone(), - route_layer_handle, + (route_layer_handle, stderr_layer_handle), analytics::MockAnalytics::new(&server.service.options), true, )) From 3beda8833db9c8fb69df04fc69aa11f402f6762e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 14 Feb 2024 11:46:30 +0100 Subject: [PATCH 10/52] Fix and add logs --- .../update/index_documents/extract/extract_vector_points.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 117f6cc8c..ece841659 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -257,6 
+257,7 @@ fn push_vectors_diff( key_buffer: &mut Vec, delta: VectorStateDelta, ) -> Result<()> { + puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); if must_remove { key_buffer.truncate(TRUNCATE_SIZE); @@ -332,13 +333,14 @@ fn extract_vectors( } } -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] pub fn extract_embeddings( // docid, prompt prompt_reader: grenad::Reader, indexer: GrenadParameters, embedder: Arc, ) -> Result>> { + puffin::profile_function!(); let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk From 03bb6372af12504fe6d27d65a7fc2a5d2ad634b8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 14 Feb 2024 11:50:22 +0100 Subject: [PATCH 11/52] Change is_batchable_with by mergeable_with --- milli/src/update/index_documents/typed_chunk.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 07d77c68f..1fea9a70f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -52,7 +52,7 @@ impl ChunkAccumulator { match self .inner .iter() - .position(|right| right.first().map_or(false, |right| chunk.is_batchable_with(right))) + .position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right))) { Some(position) => { let v = self.inner.get_mut(position).unwrap(); @@ -92,7 +92,7 @@ pub(crate) enum TypedChunk { } impl TypedChunk { - fn is_batchable_with(&self, other: &Self) -> bool { + fn mergeable_with(&self, other: &Self) -> bool { use TypedChunk::*; match (self, other) { (FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_)) From 78e04520fca1f8b832a4935a98d373fb8f4865e2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 14 Feb 2024 15:16:16 +0100 Subject: [PATCH 12/52] Update charabia version --- Cargo.lock | 152 +++++++++++++++++++++++------------------------ milli/Cargo.toml | 2 +- 2 files changed, 75 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dfcc879e3..1f29b4b5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -45,7 +45,7 @@ dependencies = [ "actix-service", "actix-tls", "actix-utils", - "ahash 0.8.3", + "ahash", "base64 0.21.7", "bitflags 2.4.1", "brotli", @@ -183,7 +183,7 @@ dependencies = [ "actix-tls", "actix-utils", "actix-web-codegen", - "ahash 0.8.3", + "ahash", "bytes", "bytestring", "cfg-if", @@ -259,25 +259,15 @@ dependencies = [ [[package]] name = "ahash" -version = "0.7.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - -[[package]] -name = "ahash" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -304,6 +294,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "anes" version = "0.1.6" @@ -868,9 +864,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cada616ef60b20e1156dc4b0bee5306109d1b1552438d44f7044841e9e447ebc" +checksum = "3a9071b1586dd067b5fdfd2069fab932c047ca5bbce4bd2bdee8af0f4b155053" dependencies = [ "aho-corasick", "cow-utils", @@ -982,12 +978,6 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" -[[package]] -name = "cobs" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" - [[package]] name = "color-spantrace" version = "0.2.1" @@ -1325,15 +1315,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "deduplicating_array" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a636096586ca093a10ac0175bfb384d024089dca0dae54e3e69bc1c1596358e8" -dependencies = [ - "serde", -] - [[package]] name = "deranged" version = "0.3.9" @@ -1430,9 +1411,9 @@ dependencies = [ [[package]] name = "deunicode" -version = "1.3.3" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1bba4f227a4a53d12b653f50ca7bf10c9119ae2aba56aff9e0338b5c98f36a" +checksum = "3ae2a35373c5c74340b79ae6780b498b2b183915ec5dacf263aac5a099bf485a" [[package]] name = "digest" @@ -2177,20 +2158,15 @@ dependencies = [ "byteorder", ] -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.6", -] - [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "heapless" @@ -2436,7 +2412,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown", "serde", ] @@ -2558,7 +2534,7 @@ checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e" dependencies = [ "cedarwood", "fxhash", - "hashbrown 0.14.3", + "hashbrown", "lazy_static", "phf", "phf_codegen", @@ -2711,9 +2687,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f567a47e47b5420908424de2c6c5e424e3cafe588d0146bd128c0f3755758a3" +checksum = "a90d23f7cef31c6ab7ac0d4f3b23940754207f7b5a80b080c39193caffe99ac2" dependencies = [ "anyhow", "bincode", @@ -2730,9 +2706,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f3e553d55ebe9881fa5e5de588b0a153456e93564d17dfbef498912caf63a2" +checksum = "1927b7d2bd4ffc19e07691bf8609722663c341f80260a1c636cee8f1ec420dce" dependencies = [ "anyhow", "flate2", @@ -2741,9 +2717,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.27.1" +version = "0.27.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a2440cc156a4a911a174ec68203543d1efb10df3a700a59b6bf581e453c726" +checksum = "3299caa2b81c9a076535a4651a83bf7d624c15f2349f243187fffc64b5a78251" dependencies = [ "anyhow", "bincode", @@ -2758,9 +2734,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e077a410e61c962cb526f71b7effd62ffc607488a8f61869c937582d2ccb529b" +checksum = "7b82b8d2323a67dc8ff0c40751d199b7ba94cd5e3c13a5b31622d318acc79e5b" dependencies = [ "anyhow", "flate2", @@ -2769,9 +2745,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f57491adf7b311a3ee87f5e4a36454df16a2ec73de4ef28b2106fac80bd782" +checksum = "cddf783b459d54b130d956889bec052c25fcb478a304e03fa9b2289387572bc5" dependencies = [ "anyhow", "bincode", @@ -2789,9 +2765,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3476ec7748aebd2eb23d496ddfce5e7e0a5c031cffcd214451043e02d029f11" +checksum = "27c708f08f14b0806f6c4cce5324b4bcba27209463026b78c31f399f8be9d30d" dependencies = [ "anyhow", "bincode", @@ -2810,9 +2786,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1c7576a02d5e4af2bf62de51790a01bc4b8bc0d0b6a6b86a46b157f5cb306d" +checksum = "e5e67eb91652203d202f7d27ead220d1d8c9099552709b8429eae9c70f2312fb" dependencies = [ "anyhow", "bincode", @@ -2831,9 +2807,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b713ecd5b827d7d448c3c5eb3c6d5899ecaf22cd17087599996349a02c76828d" +checksum = "d45da8d9a5888f4d4e78bb29fc82ff9ae519962efb0d2d92343b6cf8e373952f" dependencies = [ "bincode", "byteorder", @@ -2848,9 +2824,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e545752f6487be87b572529ad594cb3b48d2ef20821516f598b2d152d23277b" +checksum = "41c0933295dc945178bbc08f34111dc3ef22bfee38820f78453c8f8d4f3463d1" dependencies = [ "anyhow", "bincode", @@ -2868,9 +2844,9 @@ dependencies = [ [[package]] name = "lindera-tokenizer" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a2d4606a5a4da62ac4a3680ee884a75da7f0c892dc967fc9cb983ceba39a8f" +checksum = "348ce9bb3f2e5edc577420b98cca05b2177f3af50ef5ae278a1d8a1351d56197" dependencies = [ "bincode", "byteorder", @@ -2883,9 +2859,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388b1bdf81794b5d5b8057ce0321c58ff4b90d676b637948ccc7863ae2f43d28" +checksum = "74022a57c395ed7e213a9cd5833207e3c583145078ee9a164aeaec68b30c9d8e" dependencies = [ "bincode", "byteorder", @@ -2900,9 +2876,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cdfa3e29a22c047da57fadd960ff674b720de15a1e2fb35b5ed67f3408afb469" +checksum = "a34e5564ee81af82603cd6a03c3abe6e17cc0ae598bfa5078809f06e59e96e08" dependencies = [ "anyhow", "bincode", @@ -2995,9 +2971,9 @@ dependencies = [ [[package]] name = "litemap" -version = "0.6.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "575d8a551c59104b4df91269921e5eab561aa1b77c618dac0414b5d44a4617de" +checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" @@ -3868,9 +3844,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pinyin" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd12336e3afa34152e002f57df37a7056778daa59ea542b3473b87f5fb260c4" +checksum = "16f2611cd06a1ac239a0cea4521de9eb068a6ca110324ee00631aa68daa74fc0" [[package]] name = "pkg-config" @@ -5594,11 +5570,11 @@ checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" [[package]] name = "whatlang" -version = "0.16.2" +version = "0.16.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043" +checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" dependencies = [ - "hashbrown 0.12.3", + "hashbrown", "once_cell", ] @@ -5935,6 +5911,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "zerofrom" version = "0.1.3" @@ -5958,9 +5954,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.9.6" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "591691014119b87047ead4dcf3e6adfbf73cb7c38ab6980d4f18a32138f35d46" +checksum = "eff4439ae91fb5c72b8abc12f3f2dbf51bd27e6eadb9f8a5bc8898dddb0e27ea" dependencies = [ "zerofrom", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 66e25baed..7e45168ed 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.0" bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.5", default-features = false } +charabia = { version = "0.8.7", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.11" deserr = "0.6.1" From a081da0d907b2f384ae33f770c03db6a8a482551 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 14 Feb 2024 15:34:39 +0100 Subject: [PATCH 13/52] add support for the json format in the stream route --- meilisearch/src/routes/logs.rs | 14 +++++++++++++- meilisearch/tests/logs/error.rs | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/meilisearch/src/routes/logs.rs b/meilisearch/src/routes/logs.rs index fe6ed21f8..57e2cbd22 100644 --- a/meilisearch/src/routes/logs.rs +++ b/meilisearch/src/routes/logs.rs @@ -38,6 +38,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub enum LogMode { #[default] Human, + Json, Profile, } @@ -166,7 +167,18 @@ fn 
make_layer< let fmt_layer = tracing_subscriber::fmt::layer() .with_writer(move || LogWriter { sender: sender.clone() }) - .with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE); + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); + + let stream = byte_stream(receiver, guard); + (Box::new(fmt_layer) as Box + Send + Sync>, Box::pin(stream)) + } + LogMode::Json => { + let (sender, receiver) = tokio::sync::mpsc::unbounded_channel(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_writer(move || LogWriter { sender: sender.clone() }) + .json() + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); let stream = byte_stream(receiver, guard); (Box::new(fmt_layer) as Box + Send + Sync>, Box::pin(stream)) diff --git a/meilisearch/tests/logs/error.rs b/meilisearch/tests/logs/error.rs index 637c84add..93dcccd66 100644 --- a/meilisearch/tests/logs/error.rs +++ b/meilisearch/tests/logs/error.rs @@ -89,7 +89,7 @@ async fn logs_stream_bad_mode() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Unknown value `tamo` at `.mode`: expected one of `human`, `profile`", + "message": "Unknown value `tamo` at `.mode`: expected one of `human`, `json`, `profile`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" @@ -146,7 +146,7 @@ async fn logs_stream_bad_profile_memory() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Unknown value `fmt` at `.mode`: expected one of `human`, `profile`", + "message": "Unknown value `fmt` at `.mode`: expected one of `human`, `json`, `profile`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" From 5d3bad41200411462a017aed7b8683da940fd117 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 15 Feb 2024 10:31:23 +0100 Subject: [PATCH 14/52] Update meilisearch/src/option.rs Co-authored-by: Louis Dureuil --- meilisearch/src/option.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 2c41c8065..88790cc9a 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -110,7 +110,7 @@ impl FromStr for LogMode { } #[derive(Debug, thiserror::Error)] -#[error("Unsupported log {0} mode level. Supported values are `HUMAN` and `JSON`.")] +#[error("Unsupported log mode level `{0}`. Supported values are `HUMAN` and `JSON`.")] pub struct LogModeError(String); #[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] From 1f8af81ba95df068e4f66376332cc6226d3cfc81 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 15 Feb 2024 10:32:48 +0100 Subject: [PATCH 15/52] update the log mode discussion link --- meilisearch/src/option.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 88790cc9a..f9e178e54 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -344,8 +344,7 @@ pub struct Opt { #[serde(default)] pub experimental_enable_metrics: bool, - /// TODO: TAMO: update link - /// Experimental logs mode feature. For more information, see: + /// Experimental logs mode feature. For more information, see: /// /// Change the mode of the logs on the console. 
#[clap(long, env = MEILI_EXPERIMENTAL_LOGS_MODE, default_value_t)] From d0974311130885ef4c92cf542a73358750f7a287 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 15 Feb 2024 10:58:43 +0100 Subject: [PATCH 16/52] Update meilisearch/src/option.rs Co-authored-by: Louis Dureuil --- meilisearch/src/option.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index f9e178e54..cd99bf452 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -353,7 +353,7 @@ pub struct Opt { /// Experimental logs route feature. For more information, see: /// - /// Enables the log route on the `POST /logs/stream` endpoint and the `DELETE /logs/stream` to stop receiving logs. + /// Enables the log routes on the `POST /logs/stream`, `POST /logs/stderr` endpoints, and the `DELETE /logs/stream` to stop receiving logs. #[clap(long, env = MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE)] #[serde(default)] pub experimental_enable_logs_route: bool, From 4148d391b86f071f666f2b527348ffb360663d7b Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 15 Feb 2024 14:25:25 +0100 Subject: [PATCH 17/52] move logs to stderr --- meilisearch/src/main.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index 231b1cc75..f1f93dd99 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -1,5 +1,5 @@ use std::env; -use std::io::{stderr, Write}; +use std::io::{stderr, LineWriter, Write}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -31,6 +31,7 @@ fn default_log_route_layer() -> LogRouteType { fn default_log_stderr_layer(opt: &Opt) -> LogStderrType { let layer = tracing_subscriber::fmt::layer() + .with_writer(|| LineWriter::new(std::io::stderr())) .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE); let layer = match opt.experimental_logs_mode { From 865b415b3f706e12c76aef675877e84c0f3a2075 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 15 Feb 2024 16:00:48 +0100 Subject: [PATCH 18/52] Add test rerpoducing bug --- .../tests/documents/update_documents.rs | 92 ++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/meilisearch/tests/documents/update_documents.rs b/meilisearch/tests/documents/update_documents.rs index b4f61bf99..a5d466513 100644 --- a/meilisearch/tests/documents/update_documents.rs +++ b/meilisearch/tests/documents/update_documents.rs @@ -1,4 +1,4 @@ -use meili_snap::snapshot; +use meili_snap::{json_string, snapshot}; use crate::common::encoder::Encoder; use crate::common::{GetAllDocumentsOptions, Server}; @@ -209,3 +209,93 @@ async fn error_update_documents_missing_document_id() { "https://docs.meilisearch.com/errors#missing_document_id" ); } + +#[actix_rt::test] +async fn update_faceted_document() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "rankingRules": ["facet:asc"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + + let documents: Vec<_> = (0..1000) + .map(|id| { + json!({ + "doc_id": id, + "facet": (id/3), + }) + }) + .collect(); + + let (_response, code) = index.add_documents(documents.into(), None).await; + assert_eq!(code, 202); + + index.wait_task(1).await; + + let documents = json!([ + { + "doc_id": 9, + "facet": 1.5, + } + ]); + + let (response, code) = index.update_documents(documents, None).await; + assert_eq!(code, 202, "response: {}", response); + + 
index.wait_task(2).await; + + index + .search(json!({"limit": 10}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "doc_id": 0, + "facet": 0 + }, + { + "doc_id": 1, + "facet": 0 + }, + { + "doc_id": 2, + "facet": 0 + }, + { + "doc_id": 3, + "facet": 1 + }, + { + "doc_id": 4, + "facet": 1 + }, + { + "doc_id": 5, + "facet": 1 + }, + { + "doc_id": 9, + "facet": 1.5 + }, + { + "doc_id": 6, + "facet": 2 + }, + { + "doc_id": 7, + "facet": 2 + }, + { + "doc_id": 8, + "facet": 2 + } + ] + "###); + }) + .await; +} From 9d1f489a37927c730060f195140ca368a8b4aa5d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 21 Feb 2024 18:42:16 +0100 Subject: [PATCH 19/52] Fix facet incremental indexing --- milli/src/update/facet/incremental.rs | 671 ++++++++++++++------------ milli/src/update/facet/mod.rs | 6 +- 2 files changed, 361 insertions(+), 316 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a28aa5a47..d68540967 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -18,15 +18,13 @@ use crate::update::index_documents::valid_lmdb_key; use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, Index, Result}; -enum InsertionResult { +enum ModificationResult { InPlace, Expand, Insert, -} -enum DeletionResult { - InPlace, Reduce { next: Option> }, Remove { next: Option> }, + Nothing, } /// Algorithm to incrementally insert and delete elememts into the @@ -65,8 +63,9 @@ impl FacetsUpdateIncremental { #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")] pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { + let mut current_field_id = None; + let mut facet_level_may_be_updated = false; let mut iter = self.delta_data.into_stream_merger_iter()?; - while let Some((key, value)) = iter.next()? { if !valid_lmdb_key(key) { continue; @@ -74,25 +73,47 @@ impl FacetsUpdateIncremental { let key = FacetGroupKeyCodec::::bytes_decode(key) .map_err(heed::Error::Encoding)?; + + if facet_level_may_be_updated + && current_field_id.map_or(false, |fid| fid != key.field_id) + { + // Only add or remove a level after making all the field modifications. + self.inner.add_or_delete_level(wtxn, current_field_id.unwrap())?; + facet_level_may_be_updated = false; + } + current_field_id = Some(key.field_id); + let value = KvReader::new(value); let docids_to_delete = value .get(DelAdd::Deletion) .map(CboRoaringBitmapCodec::bytes_decode) - .map(|o| o.map_err(heed::Error::Encoding)); + .map(|o| o.map_err(heed::Error::Encoding)) + .transpose()?; let docids_to_add = value .get(DelAdd::Addition) .map(CboRoaringBitmapCodec::bytes_decode) - .map(|o| o.map_err(heed::Error::Encoding)); + .map(|o| o.map_err(heed::Error::Encoding)) + .transpose()?; - if let Some(docids_to_delete) = docids_to_delete { - let docids_to_delete = docids_to_delete?; - self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; + let level_size_changed = self.inner.modify( + wtxn, + key.field_id, + key.left_bound, + docids_to_add.as_ref(), + docids_to_delete.as_ref(), + )?; + + if level_size_changed { + // if a node has been added or removed from the highest level, + // we may have to update the facet level. 
+ facet_level_may_be_updated = true; } + } - if let Some(docids_to_add) = docids_to_add { - let docids_to_add = docids_to_add?; - self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; + if let Some(field_id) = current_field_id { + if facet_level_may_be_updated { + self.inner.add_or_delete_level(wtxn, field_id)?; } } @@ -166,138 +187,78 @@ impl FacetsUpdateIncrementalInner { /// /// ## Return /// See documentation of `insert_in_level` - fn insert_in_level_0( + fn modify_in_level_0( &self, txn: &mut RwTxn, field_id: u16, facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; - let mut level0_prefix = vec![]; - level0_prefix.extend_from_slice(&field_id.to_be_bytes()); - level0_prefix.push(0); - - let mut iter = - self.db.remap_types::().prefix_iter(txn, &level0_prefix)?; - - if iter.next().is_none() { - drop(iter); - self.db.put(txn, &key, &value)?; - Ok(InsertionResult::Insert) - } else { - drop(iter); - let old_value = self.db.get(txn, &key)?; - match old_value { - Some(mut updated_value) => { - // now merge the two - updated_value.bitmap |= value.bitmap; - self.db.put(txn, &key, &updated_value)?; - Ok(InsertionResult::InPlace) - } - None => { + let old_value = self.db.get(txn, &key)?; + match (old_value, add_docids, del_docids) { + // Addition + deletion on an existing value + (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), Some(del_docids)) => { + let value = FacetGroupValue { bitmap: bitmap - del_docids | add_docids, size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::InPlace) + } + // Addition on an existing value + (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), None) => { + let value = FacetGroupValue { bitmap: bitmap | add_docids, size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::InPlace) + } + // Addition of a new value (ignore deletion) + (None, Some(add_docids), _) => { + let value = FacetGroupValue { bitmap: add_docids.clone(), size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::Insert) + } + // Deletion on an existing value, fully delete the key if the resulted value is empty. + (Some(FacetGroupValue { mut bitmap, .. }), None, Some(del_docids)) => { + bitmap -= del_docids; + if bitmap.is_empty() { + // Full deletion + let mut next_key = None; + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(txn, &key)? + { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(ModificationResult::Remove { next: next_key }) + } else { + // Partial deletion + let value = FacetGroupValue { bitmap, size: 1 }; self.db.put(txn, &key, &value)?; - Ok(InsertionResult::Insert) + Ok(ModificationResult::InPlace) } } + // Otherwise do nothing (None + no addition + deletion == Some + no addition + no deletion == Nothing), + // may be unreachable at some point. + (None, None, _) | (Some(_), None, None) => Ok(ModificationResult::Nothing), } } - /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. - /// This function works recursively. + /// Split a level node into two balanced nodes. /// - /// ## Return - /// Returns the effect of adding the facet value to the database on the given `level`. 
- /// - /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have - /// an effect on the number of keys in that level. Therefore, it did not increase the number of children - /// of the parent node. - /// - /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted - /// in the addition of a new key in that level, and that therefore the number of children - /// of the parent node should be incremented. - fn insert_in_level( + /// # Return + /// Returns `ModificationResult::Insert` if the split is successful. + fn split_group( &self, txn: &mut RwTxn, field_id: u16, level: u8, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - if level == 0 { - return self.insert_in_level_0(txn, field_id, facet_value, docids); - } - - let max_group_size = self.max_group_size; - - let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?; - // level below inserted an element - - let (insertion_key, insertion_value) = - self.find_insertion_key_value(field_id, level, facet_value, txn)?; - - match result { - // because we know that we inserted in place, the facet_value is not a new one - // thus it doesn't extend a group, and thus the insertion key computed above is - // still correct - InsertionResult::InPlace => { - let mut updated_value = insertion_value; - updated_value.bitmap |= docids; - self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - - return Ok(InsertionResult::InPlace); - } - InsertionResult::Expand => {} - InsertionResult::Insert => {} - } - - // Here we know that inserting the facet value in the level below resulted in the creation - // of a new key. Therefore, it may be the case that we need to modify the left bound of the - // insertion key (see documentation of `find_insertion_key_value` for an example of when that - // could happen). - let (insertion_key, insertion_key_was_modified) = { - let mut new_insertion_key = insertion_key.clone(); - let mut key_should_be_modified = false; - - if facet_value < insertion_key.left_bound.as_slice() { - new_insertion_key.left_bound = facet_value.to_vec(); - key_should_be_modified = true; - } - if key_should_be_modified { - let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; - assert!(is_deleted); - self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; - } - (new_insertion_key, key_should_be_modified) - }; - // Now we know that the insertion key contains the `facet_value`. - - // We still need to update the insertion value by: - // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) - // 2. Merge the previous docids with the new one - let mut updated_value = insertion_value; - - if matches!(result, InsertionResult::Insert) { - updated_value.size += 1; - } - - if updated_value.size < max_group_size { - updated_value.bitmap |= docids; - self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - if insertion_key_was_modified { - return Ok(InsertionResult::Expand); - } else { - return Ok(InsertionResult::InPlace); - } - } - - // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` - // Therefore it must be split into two nodes. 
- - let size_left = updated_value.size / 2; - let size_right = updated_value.size - size_left; + insertion_key: FacetGroupKey>, + insertion_value: FacetGroupValue, + ) -> Result { + let size_left = insertion_value.size / 2; + let size_right = insertion_value.size - size_left; let level_below = level - 1; @@ -351,34 +312,249 @@ impl FacetsUpdateIncrementalInner { self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; - Ok(InsertionResult::Insert) + Ok(ModificationResult::Insert) } - /// Insert the given facet value and corresponding document ids in the database. - pub fn insert( + fn trim_del_docids<'a>( + &self, + txn: &mut RwTxn, + field_id: u16, + level: u8, + insertion_key: &FacetGroupKey>, + insertion_value_size: usize, + del_docids: &'a RoaringBitmap, + ) -> Result> { + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut del_docids = std::borrow::Cow::Borrowed(del_docids); + let iter = self.db.range(txn, &(start_key..))?.take(insertion_value_size); + for next in iter { + let (_, value) = next?; + // if a sublevel bitmap as common docids with del_docids, + // then these docids shouldn't be removed and so, remove them from the deletion list. + if !value.bitmap.is_disjoint(&del_docids) { + *del_docids.to_mut() -= value.bitmap; + } + } + + Ok(del_docids) + } + + /// Modify the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of modifying the facet value to the database on the given `level`. + /// + /// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. + /// + /// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. + /// + /// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. + /// + /// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well. + /// + /// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. 
+ /// This case is reachable when a document id is removed from a sub-level node but is still present in another one. + /// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. + fn modify_in_level( + &self, + txn: &mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { + if level == 0 { + return self.modify_in_level_0(txn, field_id, facet_value, add_docids, del_docids); + } + + let result = + self.modify_in_level(txn, field_id, level - 1, facet_value, add_docids, del_docids)?; + // level below inserted an element + + if let ModificationResult::Nothing = result { + // if the previous level has not been modified, + // early return ModificationResult::Nothing. + return Ok(ModificationResult::Nothing); + } + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + let insertion_value_size = insertion_value.size as usize; + + let mut insertion_value_was_modified = false; + let mut updated_value = insertion_value; + + if let ModificationResult::Insert = result { + // if a key has been inserted in the sub-level raise the value size. + updated_value.size += 1; + insertion_value_was_modified = true; + } else if let ModificationResult::Remove { .. } = result { + if updated_value.size <= 1 { + // if the only remaining node is the one to delete, + // delete the key instead and early return. + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + return Ok(result); + } else { + // Reduce the value size + updated_value.size -= 1; + insertion_value_was_modified = true; + } + } + + let (insertion_key, insertion_key_modification) = + if let ModificationResult::InPlace = result { + (insertion_key, ModificationResult::InPlace) + } else { + // Inserting or deleting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). + let mut new_insertion_key = insertion_key.clone(); + let mut key_modification = ModificationResult::InPlace; + + if let ModificationResult::Remove { next } | ModificationResult::Reduce { next } = + result + { + // if the deleted facet_value is the left_bound of the current node, + // the left_bound should be updated reducing the current node. + let reduced_range = facet_value == insertion_key.left_bound; + if reduced_range { + new_insertion_key.left_bound = next.clone().unwrap(); + key_modification = ModificationResult::Reduce { next }; + } + } else if facet_value < insertion_key.left_bound.as_slice() { + // if the added facet_value is the under the left_bound of the current node, + // the left_bound should be updated expanding the current node. + new_insertion_key.left_bound = facet_value.to_vec(); + key_modification = ModificationResult::Expand; + } + + if matches!( + key_modification, + ModificationResult::Expand | ModificationResult::Reduce { .. } + ) { + // if the node should be updated, delete it, it will be recreated using a new key later. 
+ let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + } + (new_insertion_key, key_modification) + }; + + if updated_value.size < self.max_group_size { + // If there are docids to delete, trim them avoiding unexpected removal. + let del_docids = del_docids + .map(|ids| { + self.trim_del_docids( + txn, + field_id, + level, + &insertion_key, + insertion_value_size, + ids, + ) + }) + .transpose()? + .filter(|ids| !ids.is_empty()); + if let Some(del_docids) = del_docids { + updated_value.bitmap -= &*del_docids; + insertion_value_was_modified = true; + } + + if let Some(add_docids) = add_docids { + updated_value.bitmap |= add_docids; + insertion_value_was_modified = true; + } + + if insertion_value_was_modified + || matches!( + insertion_key_modification, + ModificationResult::Expand | ModificationResult::Reduce { .. } + ) + { + // if any modification occured, insert it in the database. + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + Ok(insertion_key_modification) + } else { + // this case is reachable when a docid is removed from a sub-level node but is still present in another one. + // For instance, a document containing 2 and 3, if 2 is removed, the docid should remain in the group node [1..4]. + Ok(ModificationResult::Nothing) + } + } else { + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. + self.split_group(txn, field_id, level, insertion_key, updated_value) + } + } + + /// Modify the given facet value and corresponding document ids in the database. + /// If no more document ids correspond to the facet value, delete it completely. + /// + /// ## Return + /// Returns `true` if some tree-nodes of the highest level have been removed or added implying a potential + /// addition or deletion of a facet level. + /// Otherwise returns `false` if the tree-nodes have been modified in place. + pub fn modify( &self, txn: &mut RwTxn, field_id: u16, facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result<()> { - if docids.is_empty() { - return Ok(()); + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { + if add_docids.map_or(true, RoaringBitmap::is_empty) + && del_docids.map_or(true, RoaringBitmap::is_empty) + { + return Ok(false); } - let group_size = self.group_size; let highest_level = get_highest_level(txn, self.db, field_id)?; - let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?; + let result = self.modify_in_level( + txn, + field_id, + highest_level, + facet_value, + add_docids, + del_docids, + )?; match result { - InsertionResult::InPlace => return Ok(()), - InsertionResult::Expand => return Ok(()), - InsertionResult::Insert => {} + ModificationResult::InPlace + | ModificationResult::Expand + | ModificationResult::Nothing + | ModificationResult::Reduce { .. } => Ok(false), + ModificationResult::Insert | ModificationResult::Remove { .. } => Ok(true), } + } - // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. - // If it has, we must build an addition level above it. - + /// Check whether the highest level has exceeded `min_level_size` * `self.group_size`. + /// If it has, we must build an addition level above it. + /// Then check whether the highest level is under `min_level_size`. + /// If it has, we must remove the complete level. 
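As a rough illustration of the grow/shrink rule stated in the comment above, here is a minimal standalone sketch of the same decision expressed as a pure function. `LevelAction`, `decide_level_action` and the numeric values are invented for the example only; nothing here is taken from the patch itself.

// Sketch only: mirrors the rule "build a level above when the highest level
// holds at least group_size * min_level_size entries, delete the highest
// level when it holds fewer than min_level_size entries (level 0 excepted)".
#[derive(Debug, PartialEq, Eq)]
enum LevelAction {
    AddLevelAbove,
    DeleteHighestLevel,
    Nothing,
}

fn decide_level_action(
    highest_level: u8,
    size_highest_level: usize,
    group_size: u8,
    min_level_size: u8,
) -> LevelAction {
    if size_highest_level >= group_size as usize * min_level_size as usize {
        // The highest level grew too wide: group its entries into a new level above.
        LevelAction::AddLevelAbove
    } else if size_highest_level < min_level_size as usize && highest_level != 0 {
        // The highest level shrank below the minimum: drop it entirely
        // (level 0 is never deleted).
        LevelAction::DeleteHighestLevel
    } else {
        LevelAction::Nothing
    }
}

fn main() {
    // Example values: group_size = 4, min_level_size = 5.
    assert_eq!(decide_level_action(1, 20, 4, 5), LevelAction::AddLevelAbove);
    assert_eq!(decide_level_action(1, 3, 4, 5), LevelAction::DeleteHighestLevel);
    assert_eq!(decide_level_action(0, 3, 4, 5), LevelAction::Nothing);
    assert_eq!(decide_level_action(1, 10, 4, 5), LevelAction::Nothing);
}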
+ pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn, field_id: u16) -> Result<()> { + let highest_level = get_highest_level(txn, self.db, field_id)?; let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -386,10 +562,44 @@ impl FacetsUpdateIncrementalInner { let size_highest_level = self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?.count(); - if size_highest_level < self.group_size as usize * self.min_level_size as usize { - return Ok(()); + if size_highest_level >= self.group_size as usize * self.min_level_size as usize { + self.add_level(txn, field_id, highest_level, &highest_level_prefix, size_highest_level) + } else if size_highest_level < self.min_level_size as usize && highest_level != 0 { + self.delete_level(txn, &highest_level_prefix) + } else { + Ok(()) } + } + /// Delete a level. + fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> { + let mut to_delete = vec![]; + let mut iter = + self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; + for el in iter.by_ref() { + let (k, _) = el?; + to_delete.push( + FacetGroupKeyCodec::::bytes_decode(k) + .map_err(Error::Encoding)? + .into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } + + /// Build an additional level for the field id. + fn add_level( + &self, + txn: &mut RwTxn, + field_id: u16, + highest_level: u8, + highest_level_prefix: &[u8], + size_highest_level: usize, + ) -> Result<()> { let mut groups_iter = self .db .remap_types::() @@ -402,7 +612,7 @@ impl FacetsUpdateIncrementalInner { for _ in 0..nbr_new_groups { let mut first_key = None; let mut values = RoaringBitmap::new(); - for _ in 0..group_size { + for _ in 0..self.group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) .map_err(Error::Encoding)?; @@ -417,7 +627,7 @@ impl FacetsUpdateIncrementalInner { level: highest_level + 1, left_bound: first_key.unwrap().left_bound, }; - let value = FacetGroupValue { size: group_size, bitmap: values }; + let value = FacetGroupValue { size: self.group_size, bitmap: values }; to_add.push((key.into_owned(), value)); } // now we add the rest of the level, in case its size is > group_size * min_level_size @@ -452,173 +662,6 @@ impl FacetsUpdateIncrementalInner { } Ok(()) } - - /// Delete the given document id from the given facet value in the database, from level 0 to the - /// the given level. - /// - /// ## Return - /// Returns the effect of removing the document id from the database on the given `level`. - /// - /// - `DeletionResult::InPlace` means that deleting the document id did not have - /// an effect on the keys in that level. - /// - /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the - /// number of keys in the level. For example, removing a document id from the facet value `3` could - /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted - /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must - /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. - /// - /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the - /// bounds of the keys of the level. 
For example, removing a document id from the facet value - /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, - /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). - /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust - /// its left bound as well. - fn delete_in_level( - &self, - txn: &mut RwTxn, - field_id: u16, - level: u8, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - if level == 0 { - return self.delete_in_level_0(txn, field_id, facet_value, docids); - } - let (deletion_key, mut bitmap) = - self.find_insertion_key_value(field_id, level, facet_value, txn)?; - - let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?; - - let mut decrease_size = false; - let next_key = match result { - DeletionResult::InPlace => { - bitmap.bitmap -= docids; - self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; - return Ok(DeletionResult::InPlace); - } - DeletionResult::Reduce { next } => next, - DeletionResult::Remove { next } => { - decrease_size = true; - next - } - }; - // If either DeletionResult::Reduce or DeletionResult::Remove was returned, - // then we may need to adjust the left_bound of the deletion key. - - // If DeletionResult::Remove was returned, then we need to decrease the group - // size of the deletion key. - let mut updated_value = bitmap; - if decrease_size { - updated_value.size -= 1; - } - - if updated_value.size == 0 { - self.db.delete(txn, &deletion_key.as_ref())?; - Ok(DeletionResult::Remove { next: next_key }) - } else { - let mut updated_deletion_key = deletion_key.clone(); - let reduced_range = facet_value == deletion_key.left_bound; - if reduced_range { - updated_deletion_key.left_bound = next_key.clone().unwrap(); - } - updated_value.bitmap -= docids; - let _ = self.db.delete(txn, &deletion_key.as_ref())?; - self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; - if reduced_range { - Ok(DeletionResult::Reduce { next: next_key }) - } else { - Ok(DeletionResult::InPlace) - } - } - } - - fn delete_in_level_0( - &self, - txn: &mut RwTxn, - field_id: u16, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap; - bitmap -= docids; - - if bitmap.is_empty() { - let mut next_key = None; - if let Some((next, _)) = - self.db.remap_data_type::().get_greater_than(txn, &key)? - { - if next.field_id == field_id && next.level == 0 { - next_key = Some(next.left_bound.to_vec()); - } - } - self.db.delete(txn, &key)?; - Ok(DeletionResult::Remove { next: next_key }) - } else { - self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; - Ok(DeletionResult::InPlace) - } - } - - pub fn delete( - &self, - txn: &mut RwTxn, - field_id: u16, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result<()> { - if self - .db - .remap_data_type::() - .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? - .is_none() - { - return Ok(()); - } - let highest_level = get_highest_level(txn, self.db, field_id)?; - - let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?; - match result { - DeletionResult::InPlace => return Ok(()), - DeletionResult::Reduce { .. } => return Ok(()), - DeletionResult::Remove { .. 
} => {} - } - - // if we either removed a key from the highest level, its size may have fallen - // below `min_level_size`, in which case we need to remove the entire level - - let mut highest_level_prefix = vec![]; - highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); - highest_level_prefix.push(highest_level); - - if highest_level == 0 - || self - .db - .remap_types::() - .prefix_iter(txn, &highest_level_prefix)? - .count() - >= self.min_level_size as usize - { - return Ok(()); - } - let mut to_delete = vec![]; - let mut iter = - self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; - for el in iter.by_ref() { - let (k, _) = el?; - to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k) - .map_err(Error::Encoding)? - .into_owned(), - ); - } - drop(iter); - for k in to_delete { - self.db.delete(txn, &k.as_ref())?; - } - Ok(()) - } } impl<'a> FacetGroupKey<&'a [u8]> { diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ca5a21ce2..15a646836 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -429,7 +429,8 @@ pub(crate) mod test_helpers { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(key).unwrap(); - update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap(); + update.add_or_delete_level(wtxn, field_id).unwrap(); } pub fn delete_single_docid<'a>( &self, @@ -455,7 +456,8 @@ pub(crate) mod test_helpers { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(key).unwrap(); - update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); + update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap(); + update.add_or_delete_level(wtxn, field_id).unwrap(); } pub fn bulk_insert<'a, 'b>( From a493a50825fd73ef7231fbad08abd308414867fd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 22 Feb 2024 14:53:33 +0100 Subject: [PATCH 20/52] Fix clippy --- milli/src/update/facet/incremental.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index d68540967..584870a7a 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -201,7 +201,7 @@ impl FacetsUpdateIncrementalInner { match (old_value, add_docids, del_docids) { // Addition + deletion on an existing value (Some(FacetGroupValue { bitmap, .. 
}), Some(add_docids), Some(del_docids)) => { - let value = FacetGroupValue { bitmap: bitmap - del_docids | add_docids, size: 1 }; + let value = FacetGroupValue { bitmap: (bitmap - del_docids) | add_docids, size: 1 }; self.db.put(txn, &key, &value)?; Ok(ModificationResult::InPlace) } @@ -575,7 +575,7 @@ impl FacetsUpdateIncrementalInner { fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> { let mut to_delete = vec![]; let mut iter = - self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; + self.db.remap_types::().prefix_iter(txn, highest_level_prefix)?; for el in iter.by_ref() { let (k, _) = el?; to_delete.push( @@ -603,7 +603,7 @@ impl FacetsUpdateIncrementalInner { let mut groups_iter = self .db .remap_types::() - .prefix_iter(txn, &highest_level_prefix)?; + .prefix_iter(txn, highest_level_prefix)?; let nbr_new_groups = size_highest_level / self.group_size as usize; let nbr_leftover_elements = size_highest_level % self.group_size as usize; From 91cdd502f8c616ca3d63b951ee18a5d7ffb20157 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 22 Feb 2024 14:56:22 +0100 Subject: [PATCH 21/52] When processing tasks, make the update file deletion atomic --- Cargo.lock | 10 +++++---- file-store/Cargo.toml | 1 + file-store/src/lib.rs | 16 +++++++++++--- index-scheduler/Cargo.toml | 1 + index-scheduler/src/lib.rs | 44 ++++++++++++++++++++++++++++++-------- 5 files changed, 56 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 684b9e5b5..971ab602a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1728,6 +1728,7 @@ dependencies = [ "faux", "tempfile", "thiserror", + "tracing", "uuid", ] @@ -2393,6 +2394,7 @@ dependencies = [ "meilisearch-types", "page_size 0.5.0", "puffin", + "rayon", "roaring", "serde", "serde_json", @@ -4077,9 +4079,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" dependencies = [ "either", "rayon-core", @@ -4098,9 +4100,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", diff --git a/file-store/Cargo.toml b/file-store/Cargo.toml index 1b1b0cff5..5fae1aab4 100644 --- a/file-store/Cargo.toml +++ b/file-store/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] tempfile = "3.9.0" thiserror = "1.0.56" +tracing = "0.1.40" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] diff --git a/file-store/src/lib.rs b/file-store/src/lib.rs index 75db9bb5f..0f2d348ca 100644 --- a/file-store/src/lib.rs +++ b/file-store/src/lib.rs @@ -75,7 +75,13 @@ impl FileStore { /// Returns the file corresponding to the requested uuid. 
pub fn get_update(&self, uuid: Uuid) -> Result { let path = self.get_update_path(uuid); - let file = StdFile::open(path)?; + let file = match StdFile::open(path) { + Ok(file) => file, + Err(e) => { + tracing::error!("Can't access update file {uuid}: {e}"); + return Err(e.into()); + } + }; Ok(file) } @@ -110,8 +116,12 @@ impl FileStore { pub fn delete(&self, uuid: Uuid) -> Result<()> { let path = self.path.join(uuid.to_string()); - std::fs::remove_file(path)?; - Ok(()) + if let Err(e) = std::fs::remove_file(path) { + tracing::error!("Can't delete file {uuid}: {e}"); + Err(e.into()) + } else { + Ok(()) + } } /// List the Uuids of the files in the FileStore diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 890312854..c758f1114 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -23,6 +23,7 @@ meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } page_size = "0.5.0" puffin = { version = "0.16.0", features = ["serialization"] } +rayon = "1.8.1" roaring = { version = "0.10.2", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 7514a2a68..535b5a36e 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -60,6 +60,7 @@ use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmap use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use puffin::FrameView; +use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; use synchronoise::SignalEvent; use time::format_description::well_known::Rfc3339; @@ -1175,6 +1176,9 @@ impl IndexScheduler { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchSucceeded); + let mut success = 0; + let mut failure = 0; + #[allow(unused_variables)] for (i, mut task) in tasks.into_iter().enumerate() { task.started_at = Some(started_at); @@ -1187,13 +1191,15 @@ impl IndexScheduler { }, )?; + match task.error { + Some(_) => failure += 1, + None => success += 1, + } + self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; - if let Err(e) = self.delete_persisted_task_data(&task) { - tracing::error!("Failure to delete the content files associated with task {}. Error: {e}", task.uid); - } } - tracing::info!("A batch of tasks was successfully completed."); + tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); } // If we have an abortion error we must stop the tick here and re-schedule tasks. Err(Error::Milli(milli::Error::InternalError( @@ -1204,6 +1210,7 @@ impl IndexScheduler { self.breakpoint(Breakpoint::AbortedIndexation); wtxn.abort(); + tracing::info!("A batch of tasks was aborted."); // We make sure that we don't call `stop_processing` on the `processing_tasks`, // this is because we want to let the next tick call `create_next_batch` and keep // the `started_at` date times and `processings` of the current processing tasks. @@ -1225,6 +1232,8 @@ impl IndexScheduler { self.index_mapper.resize_index(&wtxn, &index_uid)?; wtxn.abort(); + tracing::info!("The max database size was reached. Resizing the index."); + return Ok(TickOutcome::TickAgain(0)); } // In case of a failure we must get back and patch all the tasks with the error. 
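A minimal sketch of the tallying pattern introduced just above: a task counts as failed when its `error` field is set, successful otherwise, and a single summary line is reported per batch. `TaskOutcome` and `tally` are stand-in names for the example; the real scheduler inspects its own `Task` type.

// Sketch only: counts successes and failures in one pass over a batch.
struct TaskOutcome {
    error: Option<String>,
}

fn tally(tasks: &[TaskOutcome]) -> (usize, usize) {
    tasks.iter().fold((0, 0), |(success, failure), task| match task.error {
        None => (success + 1, failure),
        Some(_) => (success, failure + 1),
    })
}

fn main() {
    let tasks = vec![
        TaskOutcome { error: None },
        TaskOutcome { error: Some("invalid document".into()) },
        TaskOutcome { error: None },
    ];
    let (success, failure) = tally(&tasks);
    println!("A batch of tasks was completed with {success} successful tasks and {failure} failed tasks.");
}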
@@ -1232,9 +1241,9 @@ impl IndexScheduler { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchFailed); let error: ResponseError = err.into(); - for id in ids { + for id in ids.iter() { let mut task = self - .get_task(&wtxn, id) + .get_task(&wtxn, *id) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? .ok_or(Error::CorruptedTaskQueue)?; task.started_at = Some(started_at); @@ -1246,9 +1255,8 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?; - if let Err(e) = self.delete_persisted_task_data(&task) { - tracing::error!("Failure to delete the content files associated with task {}. Error: {e}", task.uid); - } + tracing::info!("Batch failed {}", error); + self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; } @@ -1262,6 +1270,24 @@ impl IndexScheduler { wtxn.commit().map_err(Error::HeedTransaction)?; + // Once the tasks are commited, we should delete all the update files associated ASAP to avoid leaking files in case of a restart + tracing::debug!("Deleting the upadate files"); + + ids.into_par_iter().try_for_each(|id| -> Result<()> { + let rtxn = self.read_txn()?; + let task = self + .get_task(&rtxn, id) + .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? + .ok_or(Error::CorruptedTaskQueue)?; + if let Err(e) = self.delete_persisted_task_data(&task) { + tracing::error!( + "Failure to delete the content files associated with task {}. Error: {e}", + task.uid + ); + } + Ok(()) + })?; + // We shouldn't crash the tick function if we can't send data to the webhook. let _ = self.notify_webhook(&processed); From 55796406c5de89f60f6c103d60daf48c762d1920 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 26 Feb 2024 10:41:47 +0100 Subject: [PATCH 22/52] Add GPU analytics --- meilisearch/src/analytics/segment_analytics.rs | 2 ++ milli/src/vector/mod.rs | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 7e9fff925..865b3df22 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -256,6 +256,7 @@ struct Infos { experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, + gpu_enabled: bool, db_path: bool, import_dump: bool, dump_dir: bool, @@ -342,6 +343,7 @@ impl From for Infos { experimental_logs_mode, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, + gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), dump_dir: dump_dir != PathBuf::from("dumps/"), diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 99b7bff7e..6aa324da9 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -261,3 +261,7 @@ impl DistributionShift { score } } + +pub const fn is_cuda_enabled() -> bool { + cfg!(feature = "cuda") +} From 066a7a3cde2695e7bd8c5d7460e1b500125d9647 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 26 Feb 2024 10:43:04 +0100 Subject: [PATCH 23/52] takes only one read transaction per thread --- index-scheduler/src/batch.rs | 22 +++++++++++++------- index-scheduler/src/lib.rs | 39 +++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 8e2eb26a0..b7e31c136 100644 --- a/index-scheduler/src/batch.rs +++ 
b/index-scheduler/src/batch.rs @@ -142,22 +142,28 @@ pub(crate) enum IndexOperation { impl Batch { /// Return the task ids associated with this batch. - pub fn ids(&self) -> Vec { + pub fn ids(&self) -> RoaringBitmap { match self { Batch::TaskCancelation { task, .. } | Batch::Dump(task) | Batch::IndexCreation { task, .. } - | Batch::IndexUpdate { task, .. } => vec![task.uid], + | Batch::IndexUpdate { task, .. } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } Batch::SnapshotCreation(tasks) | Batch::TaskDeletions(tasks) - | Batch::IndexDeletion { tasks, .. } => tasks.iter().map(|task| task.uid).collect(), + | Batch::IndexDeletion { tasks, .. } => { + RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) + } Batch::IndexOperation { op, .. } => match op { IndexOperation::DocumentOperation { tasks, .. } | IndexOperation::Settings { tasks, .. } | IndexOperation::DocumentClear { tasks, .. } => { - tasks.iter().map(|task| task.uid).collect() + RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) + } + IndexOperation::IndexDocumentDeletionByFilter { task, .. } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() } - IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid], IndexOperation::SettingsAndDocumentOperation { document_import_tasks: tasks, settings_tasks: other, @@ -167,9 +173,11 @@ impl Batch { cleared_tasks: tasks, settings_tasks: other, .. - } => tasks.iter().chain(other).map(|task| task.uid).collect(), + } => RoaringBitmap::from_iter(tasks.iter().chain(other).map(|task| task.uid)), }, - Batch::IndexSwap { task } => vec![task.uid], + Batch::IndexSwap { task } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } } } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 535b5a36e..38a999ad7 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -37,8 +37,8 @@ use std::fs::File; use std::io::{self, BufReader, Read}; use std::ops::{Bound, RangeBounds}; use std::path::{Path, PathBuf}; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering::Relaxed; +use std::sync::atomic::Ordering::{self, Relaxed}; +use std::sync::atomic::{AtomicBool, AtomicU32}; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -60,6 +60,7 @@ use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmap use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use puffin::FrameView; +use rayon::current_num_threads; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; use synchronoise::SignalEvent; @@ -1139,15 +1140,13 @@ impl IndexScheduler { drop(rtxn); // 1. store the starting date with the bitmap of processing tasks. 
- let mut ids = batch.ids(); - ids.sort_unstable(); + let ids = batch.ids(); let processed_tasks = ids.len(); - let processing_tasks = RoaringBitmap::from_sorted_iter(ids.iter().copied()).unwrap(); let started_at = OffsetDateTime::now_utc(); // We reset the must_stop flag to be sure that we don't stop processing tasks self.must_stop_processing.reset(); - self.processing_tasks.write().unwrap().start_processing_at(started_at, processing_tasks); + self.processing_tasks.write().unwrap().start_processing_at(started_at, ids.clone()); #[cfg(test)] self.breakpoint(Breakpoint::BatchCreated); @@ -1243,7 +1242,7 @@ impl IndexScheduler { let error: ResponseError = err.into(); for id in ids.iter() { let mut task = self - .get_task(&wtxn, *id) + .get_task(&wtxn, id) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? .ok_or(Error::CorruptedTaskQueue)?; task.started_at = Some(started_at); @@ -1273,17 +1272,21 @@ impl IndexScheduler { // Once the tasks are commited, we should delete all the update files associated ASAP to avoid leaking files in case of a restart tracing::debug!("Deleting the upadate files"); - ids.into_par_iter().try_for_each(|id| -> Result<()> { + //We take one read transaction **per thread**. Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap + let idx = AtomicU32::new(0); + (0..current_num_threads()).into_par_iter().try_for_each(|_| -> Result<()> { let rtxn = self.read_txn()?; - let task = self - .get_task(&rtxn, id) - .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? - .ok_or(Error::CorruptedTaskQueue)?; - if let Err(e) = self.delete_persisted_task_data(&task) { - tracing::error!( - "Failure to delete the content files associated with task {}. Error: {e}", - task.uid - ); + while let Some(id) = ids.select(idx.fetch_add(1, Ordering::Relaxed)) { + let task = self + .get_task(&rtxn, id) + .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? + .ok_or(Error::CorruptedTaskQueue)?; + if let Err(e) = self.delete_persisted_task_data(&task) { + tracing::error!( + "Failure to delete the content files associated with task {}. Error: {e}", + task.uid + ); + } } Ok(()) })?; @@ -1696,7 +1699,7 @@ pub enum TickOutcome { /// The scheduler should immediately attempt another `tick`. /// /// The `usize` field contains the number of processed tasks. - TickAgain(usize), + TickAgain(u64), /// The scheduler should wait for an external signal before attempting another `tick`. 
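The change above distributes the update-file deletions over a fixed number of workers that share a single atomic cursor into the id bitmap, so each worker opens one read transaction instead of one per task. Below is a self-contained sketch of that work-sharing pattern using the `rayon` and `roaring` crates, with a dummy workload standing in for the real transaction and file deletion.

// Sketch only (assumed crates: rayon 1.8, roaring 0.10): a pool of workers
// pulls ids out of a shared RoaringBitmap through one shared atomic cursor.
use std::sync::atomic::{AtomicU32, Ordering};

use rayon::current_num_threads;
use rayon::prelude::*;
use roaring::RoaringBitmap;

fn main() {
    let ids: RoaringBitmap = (0u32..10_000).collect();
    let cursor = AtomicU32::new(0);

    (0..current_num_threads()).into_par_iter().for_each(|worker| {
        // In the scheduler this is where a read transaction would be opened,
        // once per worker rather than once per id.
        let mut handled = 0u32;
        // `select` returns the nth smallest id, or None once the cursor runs
        // past the end of the bitmap, which ends the worker's loop.
        while let Some(id) = ids.select(cursor.fetch_add(1, Ordering::Relaxed)) {
            // Process `id` here (e.g. delete its update file).
            let _ = id;
            handled += 1;
        }
        println!("worker {worker} handled {handled} ids");
    });
}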
WaitForSignal, } From eb25b0739028b306d90a4b7af42670f7092c16b0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 7 Sep 2023 11:16:51 +0200 Subject: [PATCH 24/52] let you specify your task id --- index-scheduler/src/error.rs | 4 + index-scheduler/src/lib.rs | 815 ++++++++++++-------- meilisearch/src/lib.rs | 2 +- meilisearch/src/routes/dump.rs | 5 +- meilisearch/src/routes/indexes/documents.rs | 25 +- meilisearch/src/routes/indexes/mod.rs | 12 +- meilisearch/src/routes/indexes/settings.rs | 18 +- meilisearch/src/routes/mod.rs | 30 +- meilisearch/src/routes/snapshot.rs | 5 +- meilisearch/src/routes/swap_indexes.rs | 5 +- meilisearch/src/routes/tasks.rs | 9 +- meilisearch/tests/index/create_index.rs | 71 ++ 12 files changed, 655 insertions(+), 346 deletions(-) diff --git a/index-scheduler/src/error.rs b/index-scheduler/src/error.rs index bbe526460..223b84762 100644 --- a/index-scheduler/src/error.rs +++ b/index-scheduler/src/error.rs @@ -48,6 +48,8 @@ impl From for Code { pub enum Error { #[error("{1}")] WithCustomErrorCode(Code, Box), + #[error("Received bad task id: {received} should be >= to {expected}.")] + BadTaskId { received: TaskId, expected: TaskId }, #[error("Index `{0}` not found.")] IndexNotFound(String), #[error("Index `{0}` already exists.")] @@ -161,6 +163,7 @@ impl Error { match self { Error::IndexNotFound(_) | Error::WithCustomErrorCode(_, _) + | Error::BadTaskId { .. } | Error::IndexAlreadyExists(_) | Error::SwapDuplicateIndexFound(_) | Error::SwapDuplicateIndexesFound(_) @@ -205,6 +208,7 @@ impl ErrorCode for Error { fn error_code(&self) -> Code { match self { Error::WithCustomErrorCode(code, _) => *code, + Error::BadTaskId { .. } => Code::BadRequest, Error::IndexNotFound(_) => Code::IndexNotFound, Error::IndexAlreadyExists(_) => Code::IndexAlreadyExists, Error::SwapDuplicateIndexesFound(_) => Code::InvalidSwapDuplicateIndexFound, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 7514a2a68..b1edaabe5 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -993,7 +993,7 @@ impl IndexScheduler { /// Register a new task in the scheduler. /// /// If it fails and data was associated with the task, it tries to delete the associated data. - pub fn register(&self, kind: KindWithContent) -> Result { + pub fn register(&self, kind: KindWithContent, task_id: Option) -> Result { let mut wtxn = self.env.write_txn()?; // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task @@ -1003,8 +1003,16 @@ impl IndexScheduler { return Err(Error::NoSpaceLeftInTaskQueue); } + let next_task_id = self.next_task_id(&wtxn)?; + + if let Some(uid) = task_id { + if uid < next_task_id { + return Err(Error::BadTaskId { received: uid, expected: next_task_id }); + } + } + let mut task = Task { - uid: self.next_task_id(&wtxn)?, + uid: task_id.unwrap_or(next_task_id), enqueued_at: OffsetDateTime::now_utc(), started_at: None, finished_at: None, @@ -1386,13 +1394,16 @@ impl IndexScheduler { // increase time by one nanosecond so that the enqueuedAt of the last task to delete is also lower than that date. 
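To make the new behaviour of `register` concrete, here is a small sketch of the uid-selection rule it now applies: an explicit task id is accepted only when it is greater than or equal to the next id the scheduler would have assigned, otherwise registration fails. `resolve_task_uid` and this local `BadTaskId` struct are illustrative stand-ins, not the scheduler's actual types.

// Sketch only: choose the task uid from an optional caller-provided value.
#[derive(Debug, PartialEq, Eq)]
struct BadTaskId {
    received: u32,
    expected: u32,
}

fn resolve_task_uid(requested: Option<u32>, next_task_id: u32) -> Result<u32, BadTaskId> {
    match requested {
        // A requested id lower than the next auto-assigned id is rejected.
        Some(uid) if uid < next_task_id => {
            Err(BadTaskId { received: uid, expected: next_task_id })
        }
        // Otherwise the requested id is used as-is.
        Some(uid) => Ok(uid),
        // No explicit id: fall back to the auto-assigned one.
        None => Ok(next_task_id),
    }
}

fn main() {
    // Suppose the queue would auto-assign 12 next.
    assert_eq!(resolve_task_uid(None, 12), Ok(12));
    assert_eq!(resolve_task_uid(Some(42), 12), Ok(42));
    assert_eq!(
        resolve_task_uid(Some(3), 12),
        Err(BadTaskId { received: 3, expected: 12 })
    );
}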
let delete_before = last_task_to_delete.enqueued_at + Duration::from_nanos(1); - self.register(KindWithContent::TaskDeletion { - query: format!( - "?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled", - delete_before.format(&Rfc3339).map_err(|_| Error::CorruptedTaskQueue)?, - ), - tasks: to_delete, - })?; + self.register( + KindWithContent::TaskDeletion { + query: format!( + "?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled", + delete_before.format(&Rfc3339).map_err(|_| Error::CorruptedTaskQueue)?, + ), + tasks: to_delete, + }, + None, + )?; Ok(()) } @@ -2016,7 +2027,7 @@ mod tests { for (idx, kind) in kinds.into_iter().enumerate() { let k = kind.as_kind(); - let task = index_scheduler.register(kind).unwrap(); + let task = index_scheduler.register(kind, None).unwrap(); index_scheduler.assert_internally_consistent(); assert_eq!(task.uid, idx as u32); @@ -2031,18 +2042,18 @@ mod tests { fn insert_task_while_another_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id")).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated]); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation"); // while the task is processing can we register another task? - index_scheduler.register(index_creation_task("index_b", "id")).unwrap(); + index_scheduler.register(index_creation_task("index_b", "id"), None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }) + .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); } @@ -2051,7 +2062,7 @@ mod tests { fn test_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id")).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task"); handle.advance_till([Start, BatchCreated]); @@ -2065,17 +2076,23 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2094,22 +2111,25 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); index_scheduler - 
.register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_fourth_task"); @@ -2142,7 +2162,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2151,10 +2171,13 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); index_scheduler - .register(KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0, 1]), - }) + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0, 1]), + }, + None, + ) .unwrap(); // again, no progress made at all, but one more task is registered snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_enqueued"); @@ -2188,7 +2211,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2199,10 +2222,13 @@ mod tests { // Now we delete the first task index_scheduler - .register(KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_task_deletion"); @@ -2225,7 +2251,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2237,10 +2263,13 @@ mod tests { // Now we delete the first task multiple times in a row for _ in 0..2 { index_scheduler - .register(KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2263,14 +2292,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); 
index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); @@ -2292,7 +2324,10 @@ mod tests { }"#; index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2300,19 +2335,22 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2336,21 +2374,27 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1"), S("2")], - }) + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1"), S("2")], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2373,10 +2417,13 @@ mod tests { fn document_deletion_and_document_addition() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); index_scheduler - .register(KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1"), S("2")], - }) + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1"), S("2")], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2390,14 +2437,17 @@ mod tests { let documents_count = 
read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2428,17 +2478,20 @@ mod tests { for name in index_names { index_scheduler - .register(KindWithContent::IndexCreation { - index_uid: name.to_string(), - primary_key: None, - }) + .register( + KindWithContent::IndexCreation { + index_uid: name.to_string(), + primary_key: None, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } for name in index_names { index_scheduler - .register(KindWithContent::DocumentClear { index_uid: name.to_string() }) + .register(KindWithContent::DocumentClear { index_uid: name.to_string() }, None) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2463,7 +2516,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2477,18 +2530,24 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_d"); index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("c".to_owned(), "d".to_owned()) }, - ], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("c".to_owned(), "d".to_owned()) }, + ], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_registered"); index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "two_swaps_registered"); @@ -2498,7 +2557,7 @@ mod tests { handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_swap_processed"); - index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }).unwrap(); + index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }, None).unwrap(); handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_empty_swap_processed"); } @@ -2515,7 +2574,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_n_successful_batches(4); @@ -2525,12 +2584,15 @@ mod tests { snapshot!(first_snap, name: "initial_tasks_processed"); let err = index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("b".to_owned(), "a".to_owned()) }, - ], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { 
indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("b".to_owned(), "a".to_owned()) }, + ], + }, + None, + ) .unwrap_err(); snapshot!(format!("{err}"), @"Indexes must be declared only once during a swap. `a`, `b` were specified several times."); @@ -2539,13 +2601,16 @@ mod tests { // Index `e` does not exist, but we don't check its existence yet index_scheduler - .register(KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("c".to_owned(), "e".to_owned()) }, - IndexSwap { indexes: ("d".to_owned(), "f".to_owned()) }, - ], - }) + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("c".to_owned(), "e".to_owned()) }, + IndexSwap { indexes: ("d".to_owned(), "f".to_owned()) }, + ], + }, + None, + ) .unwrap(); handle.advance_one_failed_batch(); // Now the first swap should have an error message saying `e` and `f` do not exist @@ -2566,17 +2631,20 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler)); @@ -2601,7 +2669,7 @@ mod tests { }, ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2618,7 +2686,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0)) + .register(replace_document_import_task("catto", None, 0, documents_count0), None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2626,10 +2694,13 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processed"); index_scheduler - .register(KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); @@ -2644,7 +2715,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0)) + .register(replace_document_import_task("catto", None, 0, documents_count0), None) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2652,10 +2723,13 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processing"); index_scheduler - .register(KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }) + .register( + KindWithContent::TaskCancelation { + 
query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_task_registered"); @@ -2685,7 +2759,7 @@ mod tests { replace_document_import_task("wolfo", None, 2, documents_count2), ]; for task in to_enqueue { - let _ = index_scheduler.register(task).unwrap(); + let _ = index_scheduler.register(task, None).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_one_successful_batch(); @@ -2693,10 +2767,13 @@ mod tests { handle.advance_till([Start, BatchCreated, InsideProcessBatch]); index_scheduler - .register(KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0, 1, 2]), - }) + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0, 1, 2]), + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processing_second_task_cancel_enqueued"); @@ -2724,14 +2801,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2771,14 +2851,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2820,14 +2903,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2870,14 +2956,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + 
allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2921,14 +3010,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2973,13 +3065,13 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("doggo", "bone"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); let kind = index_creation_task("whalo", "plankton"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); let kind = index_creation_task("catto", "his_own_vomit"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); handle.advance_n_successful_batches(3); @@ -3037,11 +3129,11 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3260,17 +3352,17 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], }; - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3346,20 +3438,20 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _ = 
index_scheduler.register(kind).unwrap(); + let _ = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _ = index_scheduler.register(kind).unwrap(); + let _ = index_scheduler.register(kind, None).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); handle.advance_n_successful_batches(1); let kind = KindWithContent::TaskCancelation { query: "test_query".to_string(), tasks: [0, 1, 2, 3].into_iter().collect(), }; - let task_cancelation = index_scheduler.register(kind).unwrap(); + let task_cancelation = index_scheduler.register(kind, None).unwrap(); handle.advance_n_successful_batches(1); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3394,7 +3486,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); handle.advance_one_failed_batch(); @@ -3419,14 +3511,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated]); @@ -3457,14 +3552,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3513,14 +3611,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3561,14 +3662,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - 
.register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3596,7 +3700,10 @@ mod tests { // Create the index. index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_one_successful_batch(); @@ -3615,14 +3722,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3655,7 +3765,10 @@ mod tests { // Create the index. index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_one_successful_batch(); @@ -3674,14 +3787,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3718,7 +3834,10 @@ mod tests { // Create the index. 
index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_one_successful_batch(); @@ -3738,14 +3857,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3791,14 +3913,17 @@ mod tests { let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3843,14 +3968,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3904,14 +4032,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -3961,14 +4092,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -4042,14 +4176,17 @@ mod tests { file.persist().unwrap(); index_scheduler - 
.register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: primary_key.map(|pk| pk.to_string()), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: primary_key.map(|pk| pk.to_string()), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -4125,14 +4262,17 @@ mod tests { file.persist().unwrap(); index_scheduler - .register(KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: primary_key.map(|pk| pk.to_string()), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }) + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: primary_key.map(|pk| pk.to_string()), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -4186,7 +4326,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated, ProcessBatchFailed, AfterProcessing]); @@ -4206,15 +4346,18 @@ mod tests { }); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); // on average this task takes ~600 bytes loop { - let result = index_scheduler.register(KindWithContent::IndexCreation { - index_uid: S("doggo"), - primary_key: None, - }); + let result = index_scheduler.register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ); if result.is_err() { break; } @@ -4224,7 +4367,10 @@ mod tests { // at this point the task DB shoud have reached its limit and we should not be able to register new tasks let result = index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code @@ -4232,10 +4378,10 @@ mod tests { // Even the task deletion that doesn't delete anything shouldn't be accepted let result = index_scheduler - .register(KindWithContent::TaskDeletion { - query: S("test"), - tasks: RoaringBitmap::new(), - }) + .register( + KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, + None, + ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. 
Please delete tasks to continue performing write operations."); // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code @@ -4243,13 +4389,19 @@ mod tests { // But a task deletion that delete something should works index_scheduler - .register(KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }) + .register( + KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); // Now we should be able to enqueue a few tasks again index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_failed_batch(); } @@ -4262,22 +4414,34 @@ mod tests { }); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_successful_batch(); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); handle.advance_one_failed_batch(); // at this point the max number of tasks is reached // we can still enqueue multiple tasks index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); index_scheduler - .register(KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }) + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) .unwrap(); let rtxn = index_scheduler.env.read_txn().unwrap(); @@ -4325,11 +4489,11 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind).unwrap(); + let _task = index_scheduler.register(kind, None).unwrap(); snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" { @@ -4479,11 +4643,11 @@ mod tests { query: "cancel dump".to_owned(), tasks: RoaringBitmap::from_iter([0]), }; - let _ = index_scheduler.register(dump_creation).unwrap(); + let _ = index_scheduler.register(dump_creation, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - let _ = index_scheduler.register(dump_cancellation).unwrap(); + let _ = index_scheduler.register(dump_cancellation, None).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); @@ -4491,4 +4655,21 @@ mod tests { handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); } + + #[test] + fn basic_set_taskid() { + 
let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, None).unwrap(); + snapshot!(task.uid, @"0"); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, Some(12)).unwrap(); + snapshot!(task.uid, @"12"); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let error = index_scheduler.register(kind, Some(5)).unwrap_err(); + snapshot!(error, @"Received bad task id: 5 should be >= to 13."); + } } diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 01ca63857..b91edaf01 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -265,7 +265,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< .name(String::from("register-snapshot-tasks")) .spawn(move || loop { thread::sleep(snapshot_delay); - if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation) { + if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation, None) { error!("Error while registering snapshot: {}", e); } }) diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 071ae60b8..8f44070d8 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -11,7 +11,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::SummarizedTaskView; +use crate::routes::{get_task_id, SummarizedTaskView}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); @@ -29,8 +29,9 @@ pub async fn create_dump( keys: auth_controller.list_keys()?, instance_uid: analytics.instance_uid().cloned(), }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create dump"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 1f41fa10c..492f039cf 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -7,7 +7,7 @@ use bstr::ByteSlice as _; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::Deserr; use futures::StreamExt; -use index_scheduler::IndexScheduler; +use index_scheduler::{IndexScheduler, TaskId}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType}; @@ -36,7 +36,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use crate::routes::{get_task_id, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::search::parse_filter; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -130,9 +130,10 @@ pub async fn delete_document( index_uid: index_uid.to_string(), 
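The `basic_set_taskid` test above pins down the contract of the new two-argument `register`: pass `None` to let the scheduler assign the next uid, or `Some(uid)` to force one, which is only accepted when it is greater than or equal to the next available uid. Condensed into a short illustrative sketch (not part of the patch; it assumes a freshly created scheduler and the same `S` test helper used throughout this module):

    // None: the scheduler picks the next free uid (0 on a fresh queue).
    let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None };
    assert_eq!(index_scheduler.register(kind, None).unwrap().uid, 0);

    // Some(uid): accepted because 12 >= 1, and the internal counter jumps past it.
    let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None };
    assert_eq!(index_scheduler.register(kind, Some(12)).unwrap().uid, 12);

    // A uid below the next available one is refused with
    // "Received bad task id: 5 should be >= to 13."
    let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None };
    assert!(index_scheduler.register(kind, Some(5)).is_err());
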
documents_ids: vec![document_id], }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); - debug!(returns = ?task, "Delete document"); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } @@ -277,6 +278,7 @@ pub async fn replace_documents( analytics.add_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); + let uid = get_task_id(&req)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -285,6 +287,7 @@ pub async fn replace_documents( params.csv_delimiter, body, IndexDocumentsMethod::ReplaceDocuments, + uid, allow_index_creation, ) .await?; @@ -309,6 +312,7 @@ pub async fn update_documents( analytics.update_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); + let uid = get_task_id(&req)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -317,6 +321,7 @@ pub async fn update_documents( params.csv_delimiter, body, IndexDocumentsMethod::UpdateDocuments, + uid, allow_index_creation, ) .await?; @@ -334,6 +339,7 @@ async fn document_addition( csv_delimiter: Option, mut body: Payload, method: IndexDocumentsMethod, + task_id: Option, allow_index_creation: bool, ) -> Result { let format = match ( @@ -450,7 +456,7 @@ async fn document_addition( }; let scheduler = index_scheduler.clone(); - let task = match tokio::task::spawn_blocking(move || scheduler.register(task)).await? { + let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id)).await? 
{ Ok(task) => task, Err(e) => { index_scheduler.delete_update_file(uuid)?; @@ -480,8 +486,9 @@ pub async fn delete_documents_batch( let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete documents by batch"); Ok(HttpResponse::Accepted().json(task)) @@ -516,8 +523,9 @@ pub async fn delete_documents_by_filter( .map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?; let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete documents by filter"); Ok(HttpResponse::Accepted().json(task)) @@ -533,8 +541,9 @@ pub async fn clear_all_documents( analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete all documents"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index d80bd9c61..6451d930d 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -17,7 +17,7 @@ use serde_json::json; use time::OffsetDateTime; use tracing::debug; -use super::{Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -137,8 +137,9 @@ pub async fn create_index( ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create index"); Ok(HttpResponse::Accepted().json(task)) @@ -206,8 +207,9 @@ pub async fn update_index( primary_key: body.primary_key, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Update index"); Ok(HttpResponse::Accepted().json(task)) @@ -216,11 +218,13 @@ pub async fn update_index( pub async fn delete_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, + req: HttpRequest, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || 
index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete index"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 23e8925c7..9fbd84161 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -15,7 +15,7 @@ use tracing::debug; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; -use crate::routes::SummarizedTaskView; +use crate::routes::{get_task_id, SummarizedTaskView}; #[macro_export] macro_rules! make_setting_route { @@ -34,7 +34,7 @@ macro_rules! make_setting_route { use $crate::extractors::authentication::policies::*; use $crate::extractors::authentication::GuardedData; use $crate::extractors::sequential_extractor::SeqHandler; - use $crate::routes::SummarizedTaskView; + use $crate::routes::{get_task_id, SummarizedTaskView}; pub async fn delete( index_scheduler: GuardedData< @@ -42,6 +42,7 @@ macro_rules! make_setting_route { Data, >, index_uid: web::Path, + req: HttpRequest, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -56,8 +57,9 @@ macro_rules! make_setting_route { is_deletion: true, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? .into(); @@ -105,8 +107,9 @@ macro_rules! make_setting_route { is_deletion: false, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? 
.into(); @@ -767,8 +770,9 @@ pub async fn update_all( is_deletion: false, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Update all settings"); Ok(HttpResponse::Accepted().json(task)) @@ -790,6 +794,7 @@ pub async fn get_all( pub async fn delete_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, + req: HttpRequest, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -803,8 +808,9 @@ pub async fn delete_all( is_deletion: true, allow_index_creation, }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete all settings"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 89cf63c50..61a9f3352 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -4,7 +4,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; -use meilisearch_types::error::ResponseError; +use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; use serde::{Deserialize, Serialize}; @@ -45,6 +45,34 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/experimental-features").configure(features::configure)); } +pub fn get_task_id(req: &HttpRequest) -> Result, ResponseError> { + let task_id = req + .headers() + .get("TaskId") + .map(|header| { + header.to_str().map_err(|e| { + ResponseError::from_msg( + format!("TaskId is not a valid utf-8 string: {e}"), + Code::BadRequest, + ) + }) + }) + .transpose()? 
+ .map(|s| { + s.parse::().map_err(|e| { + ResponseError::from_msg( + format!( + "Could not parse the TaskId as a {}: {e}", + std::any::type_name::(), + ), + Code::BadRequest, + ) + }) + }) + .transpose()?; + Ok(task_id) +} + #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct SummarizedTaskView { diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index c94529932..28dbac85f 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -10,7 +10,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::SummarizedTaskView; +use crate::routes::{get_task_id, SummarizedTaskView}; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); @@ -24,8 +24,9 @@ pub async fn create_snapshot( analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req)); let task = KindWithContent::SnapshotCreation; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create snapshot"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 79e619705..64268dbfa 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -10,7 +10,7 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use serde_json::json; -use super::SummarizedTaskView; +use super::{get_task_id, SummarizedTaskView}; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; @@ -60,7 +60,8 @@ pub async fn swap_indexes( } let task = KindWithContent::IndexSwap { swaps }; + let uid = get_task_id(&req)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); Ok(HttpResponse::Accepted().json(task)) } diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 03b63001d..26e1c43f8 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -18,7 +18,7 @@ use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; use tokio::task; -use super::SummarizedTaskView; +use super::{get_task_id, SummarizedTaskView}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; @@ -197,7 +197,9 @@ async fn cancel_tasks( let task_cancelation = KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; - let task = task::spawn_blocking(move || index_scheduler.register(task_cancelation)).await??; + let uid = get_task_id(&req)?; + let task = + task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid)).await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) @@ -242,7 +244,8 @@ async fn delete_tasks( let task_deletion = KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; - let task = 
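Taken together, the `get_task_id` helper above reduces to three cases: no `TaskId` header, a header that parses as a `TaskId`, and a header that does not. A unit-test-style sketch of that behaviour (not part of the patch; it assumes `actix_web::test::TestRequest` and the `TaskId` alias re-exported by `index_scheduler`):

    // No header: the scheduler stays in charge of picking the uid.
    let req = test::TestRequest::get().to_http_request();
    assert_eq!(get_task_id(&req).unwrap(), None);

    // A numeric header is forwarded as-is.
    let req = test::TestRequest::get().insert_header(("TaskId", "25")).to_http_request();
    assert_eq!(get_task_id(&req).unwrap(), Some(25));

    // A value that is not valid UTF-8 or not a number becomes a 400 Bad Request.
    let req = test::TestRequest::get().insert_header(("TaskId", "not-a-number")).to_http_request();
    assert!(get_task_id(&req).is_err());
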
task::spawn_blocking(move || index_scheduler.register(task_deletion)).await??; + let uid = get_task_id(&req)?; + let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid)).await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) diff --git a/meilisearch/tests/index/create_index.rs b/meilisearch/tests/index/create_index.rs index 7ce56d440..b9f755f35 100644 --- a/meilisearch/tests/index/create_index.rs +++ b/meilisearch/tests/index/create_index.rs @@ -199,3 +199,74 @@ async fn error_create_with_invalid_index_uid() { } "###); } + +#[actix_rt::test] +async fn send_task_id() { + let server = Server::new().await; + let app = server.init_web_app().await; + let index = server.index("catto"); + let (response, code) = index.create(None).await; + snapshot!(code, @"202 Accepted"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "taskUid": 0, + "indexUid": "catto", + "status": "enqueued", + "type": "indexCreation", + "enqueuedAt": "[date]" + } + "###); + + let body = serde_json::to_string(&json!({ + "uid": "doggo", + "primaryKey": None::<&str>, + })) + .unwrap(); + let req = test::TestRequest::post() + .uri("/indexes") + .insert_header(("TaskId", "25")) + .insert_header(ContentType::json()) + .set_payload(body) + .to_request(); + + let res = test::call_service(&app, req).await; + snapshot!(res.status(), @"202 Accepted"); + + let bytes = test::read_body(res).await; + let response = serde_json::from_slice::(&bytes).expect("Expecting valid json"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "taskUid": 25, + "indexUid": "doggo", + "status": "enqueued", + "type": "indexCreation", + "enqueuedAt": "[date]" + } + "###); + + let body = serde_json::to_string(&json!({ + "uid": "girafo", + "primaryKey": None::<&str>, + })) + .unwrap(); + let req = test::TestRequest::post() + .uri("/indexes") + .insert_header(("TaskId", "12")) + .insert_header(ContentType::json()) + .set_payload(body) + .to_request(); + + let res = test::call_service(&app, req).await; + snapshot!(res.status(), @"400 Bad Request"); + + let bytes = test::read_body(res).await; + let response = serde_json::from_slice::(&bytes).expect("Expecting valid json"); + snapshot!(json_string!(response), @r###" + { + "message": "Received bad task id: 12 should be >= to 26.", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + "###); +} From 507739bd9893ab30d8e8e6a63364c0aa1ccb4580 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 20 Feb 2024 11:24:44 +0100 Subject: [PATCH 25/52] add an experimental cli parameter to allow specifying your task id --- .../src/analytics/segment_analytics.rs | 11 +++++++++++ .../src/extractors/sequential_extractor.rs | 1 + meilisearch/src/lib.rs | 1 + meilisearch/src/option.rs | 17 +++++++++++++++++ meilisearch/src/routes/dump.rs | 4 +++- meilisearch/src/routes/indexes/documents.rs | 19 +++++++++++++------ meilisearch/src/routes/indexes/mod.rs | 10 +++++++--- meilisearch/src/routes/indexes/settings.rs | 14 ++++++++++---- meilisearch/src/routes/mod.rs | 6 +++++- meilisearch/src/routes/snapshot.rs | 4 +++- meilisearch/src/routes/swap_indexes.rs | 4 +++- meilisearch/src/routes/tasks.rs | 7 +++++-- meilisearch/tests/index/create_index.rs | 9 +++++++-- 13 files changed, 86 insertions(+), 21 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 7e9fff925..55dd02460 100644 --- 
a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -253,6 +253,7 @@ struct Infos { env: String, experimental_enable_metrics: bool, experimental_logs_mode: LogMode, + experimental_ha_parameters: bool, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, @@ -291,7 +292,12 @@ impl From for Infos { let Opt { db_path, experimental_enable_metrics, +<<<<<<< HEAD experimental_logs_mode, +||||||| parent of 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) +======= + experimental_ha_parameters, +>>>>>>> 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, @@ -339,7 +345,12 @@ impl From for Infos { Self { env, experimental_enable_metrics, +<<<<<<< HEAD experimental_logs_mode, +||||||| parent of 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) +======= + experimental_ha_parameters, +>>>>>>> 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, db_path: db_path != PathBuf::from("./data.ms"), diff --git a/meilisearch/src/extractors/sequential_extractor.rs b/meilisearch/src/extractors/sequential_extractor.rs index c04210616..23d6cb997 100644 --- a/meilisearch/src/extractors/sequential_extractor.rs +++ b/meilisearch/src/extractors/sequential_extractor.rs @@ -131,6 +131,7 @@ gen_seq! { SeqFromRequestFut3; A B C } gen_seq! { SeqFromRequestFut4; A B C D } gen_seq! { SeqFromRequestFut5; A B C D E } gen_seq! { SeqFromRequestFut6; A B C D E F } +gen_seq! { SeqFromRequestFut7; A B C D E F G } pin_project! { #[project = ExtractProj] diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index b91edaf01..a6a0f0d77 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -468,6 +468,7 @@ pub fn configure_data( .app_data(web::Data::from(analytics)) .app_data(web::Data::new(logs_route)) .app_data(web::Data::new(logs_stderr)) + .app_data(web::Data::new(opt.clone())) .app_data( web::JsonConfig::default() .limit(http_payload_size_limit) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index cd99bf452..58f3791e8 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -52,6 +52,7 @@ const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE"; +const MEILI_EXPERIMENTAL_HA_PARAMETERS: &str = "MEILI_EXPERIMENTAL_HA_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = @@ -358,6 +359,17 @@ pub struct Opt { #[serde(default)] pub experimental_enable_logs_route: bool, + /// Enable multiple features that helps you to run meilisearch in a high availability context. 
+ /// TODO: TAMO: Update the discussion link + /// For more information, see: + /// + /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now + /// - Lets you specify a custom task ID upon registering a task + /// - Lets you execute dry-register a task (get an answer from the route but nothing is actually registered in meilisearch and it won't be processed) + #[clap(long, env = MEILI_EXPERIMENTAL_HA_PARAMETERS)] + #[serde(default)] + pub experimental_ha_parameters: bool, + /// Experimental RAM reduction during indexing, do not use in production, see: #[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)] #[serde(default)] @@ -465,6 +477,7 @@ impl Opt { experimental_enable_metrics, experimental_logs_mode, experimental_enable_logs_route, + experimental_ha_parameters, experimental_reduce_indexing_memory_usage, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); @@ -525,6 +538,10 @@ impl Opt { MEILI_EXPERIMENTAL_LOGS_MODE, experimental_logs_mode.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_HA_PARAMETERS, + experimental_ha_parameters.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE, experimental_enable_logs_route.to_string(), diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 8f44070d8..56231a759 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -12,6 +12,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); @@ -21,6 +22,7 @@ pub async fn create_dump( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { analytics.publish("Dump Created".to_string(), json!({}), Some(&req)); @@ -29,7 +31,7 @@ pub async fn create_dump( keys: auth_controller.list_keys()?, instance_uid: analytics.instance_uid().cloned(), }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 492f039cf..5bf7eaa8d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -38,6 +38,7 @@ use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::search::parse_filter; +use crate::Opt; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()] @@ -119,6 +120,7 @@ pub async fn delete_document( index_scheduler: GuardedData, Data>, path: web::Path, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = path.into_inner(); @@ -130,7 +132,7 @@ pub async fn delete_document( index_uid: index_uid.to_string(), documents_ids: vec![document_id], }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move 
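In practice the new option can be enabled with the `--experimental-ha-parameters` flag, the `MEILI_EXPERIMENTAL_HA_PARAMETERS` environment variable, or, in the integration-test harness, directly on `Opt`, which is the same pattern the `send_task_id` test adopts later in this series. A minimal sketch of the test-side setup:

    let temp = tempfile::tempdir().unwrap();
    let options = Opt { experimental_ha_parameters: true, ..default_settings(temp.path()) };
    let server = Server::new_with_options(options).await.unwrap();
    // With the flag on, a `TaskId` header on any task-registering route is honoured;
    // with it off, `get_task_id` short-circuits to `Ok(None)` and the header is ignored.
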
|| index_scheduler.register(task, uid)).await??.into(); debug!("returns: {:?}", task); @@ -268,6 +270,7 @@ pub async fn replace_documents( params: AwebQueryParameter, body: Payload, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -278,7 +281,7 @@ pub async fn replace_documents( analytics.add_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -302,6 +305,7 @@ pub async fn update_documents( params: AwebQueryParameter, body: Payload, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -312,7 +316,7 @@ pub async fn update_documents( analytics.update_documents(¶ms, index_scheduler.index(&index_uid).is_err(), &req); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -472,6 +476,7 @@ pub async fn delete_documents_batch( index_uid: web::Path, body: web::Json>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by batch"); @@ -486,7 +491,7 @@ pub async fn delete_documents_batch( let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -506,6 +511,7 @@ pub async fn delete_documents_by_filter( index_uid: web::Path, body: AwebJson, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by filter"); @@ -523,7 +529,7 @@ pub async fn delete_documents_by_filter( .map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?; let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -535,13 +541,14 @@ pub async fn clear_all_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 6451d930d..59a1f0e64 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -22,6 +22,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::Opt; pub mod 
documents; pub mod facet_search; @@ -123,6 +124,7 @@ pub async fn create_index( index_scheduler: GuardedData, Data>, body: AwebJson, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Create index"); @@ -137,7 +139,7 @@ pub async fn create_index( ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Create index"); @@ -191,6 +193,7 @@ pub async fn update_index( index_uid: web::Path, body: AwebJson, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Update index"); @@ -207,7 +210,7 @@ pub async fn update_index( primary_key: body.primary_key, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -219,10 +222,11 @@ pub async fn delete_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, req: HttpRequest, + opt: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); debug!(returns = ?task, "Delete index"); diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 9fbd84161..6e43bce41 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -16,6 +16,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::Opt; #[macro_export] macro_rules! make_setting_route { @@ -34,6 +35,7 @@ macro_rules! make_setting_route { use $crate::extractors::authentication::policies::*; use $crate::extractors::authentication::GuardedData; use $crate::extractors::sequential_extractor::SeqHandler; + use $crate::Opt; use $crate::routes::{get_task_id, SummarizedTaskView}; pub async fn delete( @@ -43,6 +45,7 @@ macro_rules! make_setting_route { >, index_uid: web::Path, req: HttpRequest, + opt: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -57,7 +60,7 @@ macro_rules! make_setting_route { is_deletion: true, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? @@ -75,6 +78,7 @@ macro_rules! make_setting_route { index_uid: actix_web::web::Path, body: deserr::actix_web::AwebJson, $err_ty>, req: HttpRequest, + opt: web::Data, $analytics_var: web::Data, ) -> std::result::Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -107,7 +111,7 @@ macro_rules! make_setting_route { is_deletion: false, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) .await?? 
@@ -655,6 +659,7 @@ pub async fn update_all( index_uid: web::Path, body: AwebJson, DeserrJsonError>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -770,7 +775,7 @@ pub async fn update_all( is_deletion: false, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); @@ -795,6 +800,7 @@ pub async fn delete_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, req: HttpRequest, + opt: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -808,7 +814,7 @@ pub async fn delete_all( is_deletion: true, allow_index_creation, }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 61a9f3352..2dc89b150 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -15,6 +15,7 @@ use tracing::debug; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; +use crate::Opt; const PAGINATION_DEFAULT_LIMIT: usize = 20; @@ -45,7 +46,10 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/experimental-features").configure(features::configure)); } -pub fn get_task_id(req: &HttpRequest) -> Result, ResponseError> { +pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, ResponseError> { + if !opt.experimental_ha_parameters { + return Ok(None); + } let task_id = req .headers() .get("TaskId") diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index 28dbac85f..6b3178126 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -11,6 +11,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); @@ -19,12 +20,13 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub async fn create_snapshot( index_scheduler: GuardedData, Data>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req)); let task = KindWithContent::SnapshotCreation; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 64268dbfa..f8adeeb18 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -16,6 +16,7 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(swap_indexes)))); @@ -32,6 +33,7 @@ pub 
async fn swap_indexes( index_scheduler: GuardedData, Data>, params: AwebJson, DeserrJsonError>, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -60,7 +62,7 @@ pub async fn swap_indexes( } let task = KindWithContent::IndexSwap { swaps }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task: SummarizedTaskView = tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 26e1c43f8..279b57e3d 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -23,6 +23,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +use crate::Opt; const DEFAULT_LIMIT: u32 = 20; @@ -161,6 +162,7 @@ async fn cancel_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -197,7 +199,7 @@ async fn cancel_tasks( let task_cancelation = KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid)).await??; let task: SummarizedTaskView = task.into(); @@ -209,6 +211,7 @@ async fn delete_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, req: HttpRequest, + opt: web::Data, analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -244,7 +247,7 @@ async fn delete_tasks( let task_deletion = KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; - let uid = get_task_id(&req)?; + let uid = get_task_id(&req, &opt)?; let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid)).await??; let task: SummarizedTaskView = task.into(); diff --git a/meilisearch/tests/index/create_index.rs b/meilisearch/tests/index/create_index.rs index b9f755f35..7a678624c 100644 --- a/meilisearch/tests/index/create_index.rs +++ b/meilisearch/tests/index/create_index.rs @@ -2,9 +2,10 @@ use actix_web::http::header::ContentType; use actix_web::test; use http::header::ACCEPT_ENCODING; use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; use crate::common::encoder::Encoder; -use crate::common::{Server, Value}; +use crate::common::{default_settings, Server, Value}; use crate::json; #[actix_rt::test] @@ -202,7 +203,11 @@ async fn error_create_with_invalid_index_uid() { #[actix_rt::test] async fn send_task_id() { - let server = Server::new().await; + let temp = tempfile::tempdir().unwrap(); + + let options = Opt { experimental_ha_parameters: true, ..default_settings(temp.path()) }; + let server = Server::new_with_options(options).await.unwrap(); + let app = server.init_web_app().await; let index = server.index("catto"); let (response, code) = index.create(None).await; From 1eb1c043b59dce623012a79cc2e2c6fcceb0cade Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 20 Feb 2024 12:16:50 +0100 Subject: [PATCH 26/52] disable the auto deletion of tasks when the ha mode is enabled --- index-scheduler/src/insta_snapshot.rs | 1 + index-scheduler/src/lib.rs | 68 +++++++++++++- .../task_deletion_have_not_been_enqueued.snap | 90 +++++++++++++++++++ 
.../task_queue_is_full.snap | 90 +++++++++++++++++++ meilisearch/src/lib.rs | 1 + 5 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap diff --git a/index-scheduler/src/insta_snapshot.rs b/index-scheduler/src/insta_snapshot.rs index 42f041578..988e75b81 100644 --- a/index-scheduler/src/insta_snapshot.rs +++ b/index-scheduler/src/insta_snapshot.rs @@ -15,6 +15,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { let IndexScheduler { autobatching_enabled, + cleanup_enabled: _, must_stop_processing: _, processing_tasks, file_store, diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index b1edaabe5..9a1799469 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -264,6 +264,9 @@ pub struct IndexSchedulerOptions { /// Set to `true` iff the index scheduler is allowed to automatically /// batch tasks together, to process multiple tasks at once. pub autobatching_enabled: bool, + /// Set to `true` iff the index scheduler is allowed to automatically + /// delete the finished tasks when there are too many tasks. + pub cleanup_enabled: bool, /// The maximum number of tasks stored in the task queue before starting /// to auto schedule task deletions. pub max_number_of_tasks: usize, @@ -324,6 +327,9 @@ pub struct IndexScheduler { /// Whether auto-batching is enabled or not. pub(crate) autobatching_enabled: bool, + /// Whether we should automatically cleanup the task queue or not. + pub(crate) cleanup_enabled: bool, + /// The max number of tasks allowed before the scheduler starts to delete /// the finished tasks automatically. 
pub(crate) max_number_of_tasks: usize, @@ -390,6 +396,7 @@ impl IndexScheduler { index_mapper: self.index_mapper.clone(), wake_up: self.wake_up.clone(), autobatching_enabled: self.autobatching_enabled, + cleanup_enabled: self.cleanup_enabled, max_number_of_tasks: self.max_number_of_tasks, max_number_of_batched_tasks: self.max_number_of_batched_tasks, puffin_frame: self.puffin_frame.clone(), @@ -491,6 +498,7 @@ impl IndexScheduler { wake_up: Arc::new(SignalEvent::auto(true)), puffin_frame: Arc::new(puffin::GlobalFrameView::default()), autobatching_enabled: options.autobatching_enabled, + cleanup_enabled: options.cleanup_enabled, max_number_of_tasks: options.max_number_of_tasks, max_number_of_batched_tasks: options.max_number_of_batched_tasks, dumps_path: options.dumps_path, @@ -1134,7 +1142,9 @@ impl IndexScheduler { self.breakpoint(Breakpoint::Start); } - self.cleanup_task_queue()?; + if self.cleanup_enabled { + self.cleanup_task_queue()?; + } let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; let batch = @@ -1781,6 +1791,7 @@ mod tests { index_count: 5, indexer_config, autobatching_enabled: true, + cleanup_enabled: true, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: usize::MAX, instance_features: Default::default(), @@ -4484,6 +4495,61 @@ mod tests { drop(rtxn); } + #[test] + fn test_disable_auto_deletion_of_tasks() { + let (index_scheduler, mut handle) = + IndexScheduler::test_with_custom_config(vec![], |config| { + config.cleanup_enabled = false; + config.max_number_of_tasks = 2; + }); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + handle.advance_one_failed_batch(); + + // at this point the max number of tasks is reached + // we can still enqueue multiple tasks + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + ) + .unwrap(); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); + let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]" }), name: "task_queue_is_full"); + drop(rtxn); + + // now we're above the max number of tasks + // and if we try to advance in the tick function no new task deletion should be enqueued + handle.advance_till([Start, BatchCreated]); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); + let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_not_been_enqueued"); + drop(rtxn); + } + #[test] fn basic_get_stats() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); diff --git a/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap 
b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap new file mode 100644 index 000000000..988df76ec --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap @@ -0,0 +1,90 @@ +--- +source: index-scheduler/src/lib.rs +--- +[ + { + "uid": 0, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "succeeded", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 1, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": { + "message": "Index `doggo` already exists.", + "code": "index_already_exists", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_already_exists" + }, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "failed", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 2, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 3, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + } +] diff --git a/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap new file mode 100644 index 000000000..988df76ec --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap @@ -0,0 +1,90 @@ +--- +source: index-scheduler/src/lib.rs +--- +[ + { + "uid": 0, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "succeeded", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 1, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": { + "message": "Index `doggo` already exists.", + "code": "index_already_exists", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_already_exists" + }, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "failed", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 2, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": "enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + }, + { + "uid": 3, + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]", + "error": null, + "canceledBy": null, + "details": { + "IndexInfo": { + "primary_key": null + } + }, + "status": 
"enqueued", + "kind": { + "indexCreation": { + "index_uid": "doggo", + "primary_key": null + } + } + } +] diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index a6a0f0d77..292a87259 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -300,6 +300,7 @@ fn open_or_create_database_unchecked( enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, indexer_config: (&opt.indexer_options).try_into()?, autobatching_enabled: true, + cleanup_enabled: !opt.experimental_ha_parameters, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize, From 36c27a18a1c45f2069fc8d0fbec4d48a49dfa447 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 21 Feb 2024 11:21:26 +0100 Subject: [PATCH 27/52] implement the dry run ha parameter --- file-store/src/lib.rs | 22 +- index-scheduler/src/lib.rs | 249 ++++++++++++++++---- meilisearch/src/lib.rs | 4 +- meilisearch/src/routes/dump.rs | 7 +- meilisearch/src/routes/indexes/documents.rs | 35 ++- meilisearch/src/routes/indexes/mod.rs | 16 +- meilisearch/src/routes/indexes/settings.rs | 20 +- meilisearch/src/routes/mod.rs | 19 ++ meilisearch/src/routes/snapshot.rs | 7 +- meilisearch/src/routes/swap_indexes.rs | 7 +- meilisearch/src/routes/tasks.rs | 10 +- 11 files changed, 317 insertions(+), 79 deletions(-) diff --git a/file-store/src/lib.rs b/file-store/src/lib.rs index 75db9bb5f..e3851a2df 100644 --- a/file-store/src/lib.rs +++ b/file-store/src/lib.rs @@ -56,7 +56,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::new_v4(); let path = self.path.join(uuid.to_string()); - let update_file = File { file, path }; + let update_file = File { dry: false, file, path }; Ok((uuid, update_file)) } @@ -67,7 +67,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::from_u128(uuid); let path = self.path.join(uuid.to_string()); - let update_file = File { file, path }; + let update_file = File { dry: false, file, path }; Ok((uuid, update_file)) } @@ -135,13 +135,29 @@ impl FileStore { } pub struct File { + dry: bool, path: PathBuf, file: NamedTempFile, } impl File { + pub fn dry_file() -> Result { + #[cfg(target_family = "unix")] + let path = PathBuf::from_str("/dev/null").unwrap(); + #[cfg(target_family = "windows")] + let path = PathBuf::from_str("\\Device\\Null").unwrap(); + + Ok(Self { + dry: true, + path: path.clone(), + file: tempfile::Builder::new().make(|_| std::fs::File::create(path.clone()))?, + }) + } + pub fn persist(self) -> Result<()> { - self.file.persist(&self.path)?; + if !self.dry { + self.file.persist(&self.path)?; + } Ok(()) } } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 9a1799469..5d0ce9eb9 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1001,7 +1001,12 @@ impl IndexScheduler { /// Register a new task in the scheduler. /// /// If it fails and data was associated with the task, it tries to delete the associated data. - pub fn register(&self, kind: KindWithContent, task_id: Option) -> Result { + pub fn register( + &self, + kind: KindWithContent, + task_id: Option, + dry_run: bool, + ) -> Result { let mut wtxn = self.env.write_txn()?; // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task @@ -1037,6 +1042,11 @@ impl IndexScheduler { // (that it does not contain duplicate indexes). 
check_index_swap_validity(&task)?; + // At this point the task is going to be registered and no further checks will be done + if dry_run { + return Ok(task); + } + // Get rid of the mutability. let task = task; @@ -1101,8 +1111,12 @@ impl IndexScheduler { /// The returned file and uuid can be used to associate /// some data to a task. The file will be kept until /// the task has been fully processed. - pub fn create_update_file(&self) -> Result<(Uuid, file_store::File)> { - Ok(self.file_store.new_update()?) + pub fn create_update_file(&self, dry_run: bool) -> Result<(Uuid, file_store::File)> { + if dry_run { + Ok((Uuid::nil(), file_store::File::dry_file()?)) + } else { + Ok(self.file_store.new_update()?) + } } #[cfg(test)] @@ -1413,6 +1427,7 @@ impl IndexScheduler { tasks: to_delete, }, None, + false, )?; Ok(()) @@ -1534,7 +1549,7 @@ impl<'a> Dump<'a> { ) -> Result { let content_uuid = match content_file { Some(content_file) if task.status == Status::Enqueued => { - let (uuid, mut file) = self.index_scheduler.create_update_file()?; + let (uuid, mut file) = self.index_scheduler.create_update_file(false)?; let mut builder = DocumentsBatchBuilder::new(file.as_file_mut()); for doc in content_file { builder.append_json_object(&doc?)?; @@ -2038,7 +2053,7 @@ mod tests { for (idx, kind) in kinds.into_iter().enumerate() { let k = kind.as_kind(); - let task = index_scheduler.register(kind, None).unwrap(); + let task = index_scheduler.register(kind, None, false).unwrap(); index_scheduler.assert_internally_consistent(); assert_eq!(task.uid, idx as u32); @@ -2053,18 +2068,18 @@ mod tests { fn insert_task_while_another_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated]); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation"); // while the task is processing can we register another task? 
- index_scheduler.register(index_creation_task("index_b", "id"), None).unwrap(); + index_scheduler.register(index_creation_task("index_b", "id"), None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); } @@ -2073,7 +2088,7 @@ mod tests { fn test_task_is_processing() { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler.register(index_creation_task("index_a", "id"), None).unwrap(); + index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task"); handle.advance_till([Start, BatchCreated]); @@ -2090,6 +2105,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2098,12 +2114,13 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2125,22 +2142,23 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None) + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_fourth_task"); @@ -2173,7 +2191,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2188,6 +2206,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0, 1]), }, None, + false, ) .unwrap(); // again, no progress made at all, but one more task is registered @@ -2222,7 +2241,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2239,6 
+2258,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_task_deletion"); @@ -2262,7 +2282,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); @@ -2280,6 +2300,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2313,6 +2334,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); @@ -2338,6 +2360,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2356,12 +2379,13 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); @@ -2395,6 +2419,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2405,6 +2430,7 @@ mod tests { documents_ids: vec![S("1"), S("2")], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2434,6 +2460,7 @@ mod tests { documents_ids: vec![S("1"), S("2")], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2458,6 +2485,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); @@ -2495,6 +2523,7 @@ mod tests { primary_key: None, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2502,7 +2531,11 @@ mod tests { for name in index_names { index_scheduler - .register(KindWithContent::DocumentClear { index_uid: name.to_string() }, None) + .register( + KindWithContent::DocumentClear { index_uid: name.to_string() }, + None, + false, + ) .unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2527,7 +2560,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2549,6 +2582,7 @@ mod tests { ], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_registered"); @@ -2558,6 +2592,7 @@ mod tests { swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "two_swaps_registered"); @@ -2568,7 +2603,9 @@ mod tests { handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_swap_processed"); - 
index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }, None).unwrap(); + index_scheduler + .register(KindWithContent::IndexSwap { swaps: vec![] }, None, false) + .unwrap(); handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_empty_swap_processed"); } @@ -2585,7 +2622,7 @@ mod tests { ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_n_successful_batches(4); @@ -2603,6 +2640,7 @@ mod tests { ], }, None, + false, ) .unwrap_err(); snapshot!(format!("{err}"), @"Indexes must be declared only once during a swap. `a`, `b` were specified several times."); @@ -2621,6 +2659,7 @@ mod tests { ], }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -2652,10 +2691,11 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None) + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler)); @@ -2680,7 +2720,7 @@ mod tests { }, ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } @@ -2697,7 +2737,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0), None) + .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2711,6 +2751,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); @@ -2726,7 +2767,7 @@ mod tests { file0.persist().unwrap(); let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0), None) + .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -2740,6 +2781,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0]), }, None, + false, ) .unwrap(); @@ -2770,7 +2812,7 @@ mod tests { replace_document_import_task("wolfo", None, 2, documents_count2), ]; for task in to_enqueue { - let _ = index_scheduler.register(task, None).unwrap(); + let _ = index_scheduler.register(task, None, false).unwrap(); index_scheduler.assert_internally_consistent(); } handle.advance_one_successful_batch(); @@ -2784,6 +2826,7 @@ mod tests { tasks: RoaringBitmap::from_iter([0, 1, 2]), }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processing_second_task_cancel_enqueued"); @@ -2822,6 +2865,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2872,6 +2916,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2924,6 +2969,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -2977,6 +3023,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3031,6 +3078,7 @@ mod tests { 
allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3076,13 +3124,13 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("doggo", "bone"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); let kind = index_creation_task("whalo", "plankton"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); let kind = index_creation_task("catto", "his_own_vomit"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); handle.advance_n_successful_batches(3); @@ -3140,11 +3188,11 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3363,17 +3411,17 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], }; - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3449,20 +3497,20 @@ mod tests { IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); let kind = index_creation_task("catto", "mouse"); - let _ = index_scheduler.register(kind, None).unwrap(); + let _ = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _ = index_scheduler.register(kind, None).unwrap(); + let _ = index_scheduler.register(kind, None, false).unwrap(); let kind = KindWithContent::IndexSwap { swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], }; - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); 
handle.advance_n_successful_batches(1); let kind = KindWithContent::TaskCancelation { query: "test_query".to_string(), tasks: [0, 1, 2, 3].into_iter().collect(), }; - let task_cancelation = index_scheduler.register(kind, None).unwrap(); + let task_cancelation = index_scheduler.register(kind, None, false).unwrap(); handle.advance_n_successful_batches(1); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); @@ -3497,7 +3545,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); handle.advance_one_failed_batch(); @@ -3532,6 +3580,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3573,6 +3622,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3632,6 +3682,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3683,6 +3734,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3714,6 +3766,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3743,6 +3796,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3779,6 +3833,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3808,6 +3863,7 @@ mod tests { allow_index_creation: false, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3848,6 +3904,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, None, + false, ) .unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); @@ -3878,6 +3935,7 @@ mod tests { allow_index_creation, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3934,6 +3992,7 @@ mod tests { allow_index_creation, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -3989,6 +4048,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4053,6 +4113,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4113,6 +4174,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4197,6 +4259,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4283,6 +4346,7 @@ mod tests { allow_index_creation: true, }, None, + false, ) .unwrap(); index_scheduler.assert_internally_consistent(); @@ -4337,7 +4401,7 @@ mod tests { let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = 
index_scheduler.register(kind, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); handle.advance_till([Start, BatchCreated, ProcessBatchFailed, AfterProcessing]); @@ -4360,6 +4424,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4368,6 +4433,7 @@ mod tests { let result = index_scheduler.register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ); if result.is_err() { break; @@ -4381,6 +4447,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); @@ -4392,6 +4459,7 @@ mod tests { .register( KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, None, + false, ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); @@ -4403,6 +4471,7 @@ mod tests { .register( KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4412,6 +4481,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -4428,6 +4498,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4436,6 +4507,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -4446,12 +4518,14 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); index_scheduler .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); @@ -4507,6 +4581,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_successful_batch(); @@ -4515,6 +4590,7 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); handle.advance_one_failed_batch(); @@ -4525,12 +4601,14 @@ mod tests { .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); index_scheduler .register( KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, None, + false, ) .unwrap(); @@ -4555,11 +4633,11 @@ mod tests { let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, 
None).unwrap(); + let _task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" { @@ -4709,11 +4787,11 @@ mod tests { query: "cancel dump".to_owned(), tasks: RoaringBitmap::from_iter([0]), }; - let _ = index_scheduler.register(dump_creation, None).unwrap(); + let _ = index_scheduler.register(dump_creation, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - let _ = index_scheduler.register(dump_cancellation, None).unwrap(); + let _ = index_scheduler.register(dump_cancellation, None, false).unwrap(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); @@ -4727,15 +4805,86 @@ mod tests { let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, None).unwrap(); + let task = index_scheduler.register(kind, None, false).unwrap(); snapshot!(task.uid, @"0"); let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, Some(12)).unwrap(); + let task = index_scheduler.register(kind, Some(12), false).unwrap(); snapshot!(task.uid, @"12"); let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let error = index_scheduler.register(kind, Some(5)).unwrap_err(); + let error = index_scheduler.register(kind, Some(5), false).unwrap_err(); snapshot!(error, @"Received bad task id: 5 should be >= to 13."); } + + #[test] + fn dry_run() { + let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, None, true).unwrap(); + snapshot!(task.uid, @"0"); + snapshot!(snapshot_index_scheduler(&index_scheduler), @r###" + ### Autobatching Enabled = true + ### Processing Tasks: + [] + ---------------------------------------------------------------------- + ### All Tasks: + ---------------------------------------------------------------------- + ### Status: + ---------------------------------------------------------------------- + ### Kind: + ---------------------------------------------------------------------- + ### Index Tasks: + ---------------------------------------------------------------------- + ### Index Mapper: + + ---------------------------------------------------------------------- + ### Canceled By: + + ---------------------------------------------------------------------- + ### Enqueued At: + ---------------------------------------------------------------------- + ### Started At: + ---------------------------------------------------------------------- + ### Finished At: + ---------------------------------------------------------------------- + ### File Store: + + ---------------------------------------------------------------------- + "###); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, Some(12), true).unwrap(); + snapshot!(task.uid, @"12"); + snapshot!(snapshot_index_scheduler(&index_scheduler), @r###" + ### Autobatching Enabled = true + ### Processing Tasks: + [] + ---------------------------------------------------------------------- + ### All 
Tasks: + ---------------------------------------------------------------------- + ### Status: + ---------------------------------------------------------------------- + ### Kind: + ---------------------------------------------------------------------- + ### Index Tasks: + ---------------------------------------------------------------------- + ### Index Mapper: + + ---------------------------------------------------------------------- + ### Canceled By: + + ---------------------------------------------------------------------- + ### Enqueued At: + ---------------------------------------------------------------------- + ### Started At: + ---------------------------------------------------------------------- + ### Finished At: + ---------------------------------------------------------------------- + ### File Store: + + ---------------------------------------------------------------------- + "###); + } } diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 292a87259..7c40059d7 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -265,7 +265,9 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< .name(String::from("register-snapshot-tasks")) .spawn(move || loop { thread::sleep(snapshot_delay); - if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation, None) { + if let Err(e) = + index_scheduler.register(KindWithContent::SnapshotCreation, None, false) + { error!("Error while registering snapshot: {}", e); } }) diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 56231a759..7f3cd06a5 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -11,7 +11,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -32,8 +32,11 @@ pub async fn create_dump( instance_uid: analytics.instance_uid().cloned(), }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); debug!(returns = ?task, "Create dump"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 5bf7eaa8d..a74bbff49 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -36,7 +36,9 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{get_task_id, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use crate::routes::{ + get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, +}; use crate::search::parse_filter; use crate::Opt; @@ -133,8 +135,11 @@ pub async fn delete_document( documents_ids: vec![document_id], }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!("returns: {:?}", task); Ok(HttpResponse::Accepted().json(task)) } @@ -282,6 +287,7 @@ pub async fn replace_documents( let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -291,6 +297,7 @@ pub async fn replace_documents( body, IndexDocumentsMethod::ReplaceDocuments, uid, + dry_run, allow_index_creation, ) .await?; @@ -317,6 +324,7 @@ pub async fn update_documents( let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task = document_addition( extract_mime_type(&req)?, index_scheduler, @@ -326,6 +334,7 @@ pub async fn update_documents( body, IndexDocumentsMethod::UpdateDocuments, uid, + dry_run, allow_index_creation, ) .await?; @@ -344,6 +353,7 @@ async fn document_addition( mut body: Payload, method: IndexDocumentsMethod, task_id: Option, + dry_run: bool, allow_index_creation: bool, ) -> Result { let format = match ( @@ -376,7 +386,7 @@ async fn document_addition( } }; - let (uuid, mut update_file) = index_scheduler.create_update_file()?; + let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; let temp_file = match tempfile() { Ok(file) => file, @@ -460,7 +470,9 @@ async fn document_addition( }; let scheduler = index_scheduler.clone(); - let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id)).await? { + let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id, dry_run)) + .await? + { Ok(task) => task, Err(e) => { index_scheduler.delete_update_file(uuid)?; @@ -492,8 +504,11 @@ pub async fn delete_documents_batch( let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); debug!(returns = ?task, "Delete documents by batch"); Ok(HttpResponse::Accepted().json(task)) @@ -530,8 +545,11 @@ pub async fn delete_documents_by_filter( let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Delete documents by filter"); Ok(HttpResponse::Accepted().json(task)) @@ -549,8 +567,11 @@ pub async fn clear_all_documents( let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Delete all documents"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 59a1f0e64..59fa02dff 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -22,6 +22,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::is_dry_run; use crate::Opt; pub mod documents; @@ -140,8 +141,11 @@ pub async fn create_index( let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Create index"); Ok(HttpResponse::Accepted().json(task)) @@ -211,8 +215,11 @@ pub async fn update_index( }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Update index"); Ok(HttpResponse::Accepted().json(task)) @@ -227,8 +234,11 @@ pub async fn delete_index( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); debug!(returns = ?task, "Delete index"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 6e43bce41..c71d83279 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -15,7 +15,7 @@ use tracing::debug; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; -use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; #[macro_export] @@ -36,7 +36,7 @@ macro_rules! make_setting_route { use $crate::extractors::authentication::GuardedData; use $crate::extractors::sequential_extractor::SeqHandler; use $crate::Opt; - use $crate::routes::{get_task_id, SummarizedTaskView}; + use $crate::routes::{is_dry_run, get_task_id, SummarizedTaskView}; pub async fn delete( index_scheduler: GuardedData< @@ -61,8 +61,9 @@ macro_rules! make_setting_route { allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) .await?? .into(); @@ -112,8 +113,9 @@ macro_rules! make_setting_route { allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)) + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) .await?? .into(); @@ -776,8 +778,11 @@ pub async fn update_all( allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Update all settings"); Ok(HttpResponse::Accepted().json(task)) @@ -815,8 +820,11 @@ pub async fn delete_all( allow_index_creation, }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Delete all settings"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 2dc89b150..f98d4b4de 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -77,6 +77,25 @@ pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, Respo Ok(task_id) } +pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result { + if !opt.experimental_ha_parameters { + return Ok(false); + } + Ok(req + .headers() + .get("DryRun") + .map(|header| { + header.to_str().map_err(|e| { + ResponseError::from_msg( + format!("DryRun is not a valid utf-8 string: {e}"), + Code::BadRequest, + ) + }) + }) + .transpose()? 
+ .map_or(false, |s| s.to_lowercase() == "true")) +} + #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct SummarizedTaskView { diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index 6b3178126..84673729f 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -10,7 +10,7 @@ use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::routes::{get_task_id, SummarizedTaskView}; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -27,8 +27,11 @@ pub async fn create_snapshot( let task = KindWithContent::SnapshotCreation; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); debug!(returns = ?task, "Create snapshot"); Ok(HttpResponse::Accepted().json(task)) diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index f8adeeb18..51a7b0707 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -10,7 +10,7 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use serde_json::json; -use super::{get_task_id, SummarizedTaskView}; +use super::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; @@ -63,7 +63,10 @@ pub async fn swap_indexes( let task = KindWithContent::IndexSwap { swaps }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = - tokio::task::spawn_blocking(move || index_scheduler.register(task, uid)).await??.into(); + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? 
+ .into(); Ok(HttpResponse::Accepted().json(task)) } diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 279b57e3d..f35d97fe6 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -18,7 +18,7 @@ use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; use tokio::task; -use super::{get_task_id, SummarizedTaskView}; +use super::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; @@ -200,8 +200,10 @@ async fn cancel_tasks( KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; let task = - task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid)).await??; + task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid, dry_run)) + .await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) @@ -248,7 +250,9 @@ async fn delete_tasks( KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; let uid = get_task_id(&req, &opt)?; - let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid)).await??; + let dry_run = is_dry_run(&req, &opt)?; + let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid, dry_run)) + .await??; let task: SummarizedTaskView = task.into(); Ok(HttpResponse::Ok().json(task)) From 60510e037bab23dde027c694bed10a5380a57f65 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 21 Feb 2024 12:30:28 +0100 Subject: [PATCH 28/52] update the discussion link --- meilisearch/src/option.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 58f3791e8..377507374 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -360,8 +360,7 @@ pub struct Opt { pub experimental_enable_logs_route: bool, /// Enable multiple features that helps you to run meilisearch in a high availability context. 
- /// TODO: TAMO: Update the discussion link - /// For more information, see: + /// For more information, see: /// /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now /// - Lets you specify a custom task ID upon registering a task From bbf3fb88ca3a9db178403f758201657b5c1d02cb Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 21 Feb 2024 14:33:40 +0100 Subject: [PATCH 29/52] rename the cli parameter --- meilisearch/src/analytics/segment_analytics.rs | 14 +++----------- meilisearch/src/lib.rs | 2 +- meilisearch/src/option.rs | 14 +++++++------- meilisearch/src/routes/mod.rs | 4 ++-- meilisearch/tests/index/create_index.rs | 3 ++- 5 files changed, 15 insertions(+), 22 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 55dd02460..262a4751a 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -253,7 +253,7 @@ struct Infos { env: String, experimental_enable_metrics: bool, experimental_logs_mode: LogMode, - experimental_ha_parameters: bool, + experimental_replication_parameters: bool, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, @@ -292,12 +292,8 @@ impl From for Infos { let Opt { db_path, experimental_enable_metrics, -<<<<<<< HEAD experimental_logs_mode, -||||||| parent of 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) -======= - experimental_ha_parameters, ->>>>>>> 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) + experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, @@ -345,12 +341,8 @@ impl From for Infos { Self { env, experimental_enable_metrics, -<<<<<<< HEAD experimental_logs_mode, -||||||| parent of 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) -======= - experimental_ha_parameters, ->>>>>>> 01ae46dd8 (add an experimental cli parameter to allow specifying your task id) + experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, db_path: db_path != PathBuf::from("./data.ms"), diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 7c40059d7..0d892e7e8 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -302,7 +302,7 @@ fn open_or_create_database_unchecked( enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, indexer_config: (&opt.indexer_options).try_into()?, autobatching_enabled: true, - cleanup_enabled: !opt.experimental_ha_parameters, + cleanup_enabled: !opt.experimental_replication_parameters, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize, diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 377507374..27f2d9c41 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -52,7 +52,7 @@ const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE"; -const MEILI_EXPERIMENTAL_HA_PARAMETERS: &str = "MEILI_EXPERIMENTAL_HA_PARAMETERS"; +const 
MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = @@ -359,15 +359,15 @@ pub struct Opt { #[serde(default)] pub experimental_enable_logs_route: bool, - /// Enable multiple features that helps you to run meilisearch in a high availability context. + /// Enable multiple features that helps you to run meilisearch in a replicated context. /// For more information, see: /// /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now /// - Lets you specify a custom task ID upon registering a task /// - Lets you execute dry-register a task (get an answer from the route but nothing is actually registered in meilisearch and it won't be processed) - #[clap(long, env = MEILI_EXPERIMENTAL_HA_PARAMETERS)] + #[clap(long, env = MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS)] #[serde(default)] - pub experimental_ha_parameters: bool, + pub experimental_replication_parameters: bool, /// Experimental RAM reduction during indexing, do not use in production, see: #[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)] @@ -476,7 +476,7 @@ impl Opt { experimental_enable_metrics, experimental_logs_mode, experimental_enable_logs_route, - experimental_ha_parameters, + experimental_replication_parameters, experimental_reduce_indexing_memory_usage, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); @@ -538,8 +538,8 @@ impl Opt { experimental_logs_mode.to_string(), ); export_to_env_if_not_present( - MEILI_EXPERIMENTAL_HA_PARAMETERS, - experimental_ha_parameters.to_string(), + MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS, + experimental_replication_parameters.to_string(), ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE, diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index f98d4b4de..249103e12 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -47,7 +47,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { } pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, ResponseError> { - if !opt.experimental_ha_parameters { + if !opt.experimental_replication_parameters { return Ok(None); } let task_id = req @@ -78,7 +78,7 @@ pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, Respo } pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result { - if !opt.experimental_ha_parameters { + if !opt.experimental_replication_parameters { return Ok(false); } Ok(req diff --git a/meilisearch/tests/index/create_index.rs b/meilisearch/tests/index/create_index.rs index 7a678624c..b309b83c6 100644 --- a/meilisearch/tests/index/create_index.rs +++ b/meilisearch/tests/index/create_index.rs @@ -205,7 +205,8 @@ async fn error_create_with_invalid_index_uid() { async fn send_task_id() { let temp = tempfile::tempdir().unwrap(); - let options = Opt { experimental_ha_parameters: true, ..default_settings(temp.path()) }; + let options = + Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) }; let server = Server::new_with_options(options).await.unwrap(); let app = server.init_web_app().await; From a478392b7a5b200d2ce6847a8173b1984e6bc955 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 22 Feb 2024 15:51:47 +0100 Subject: [PATCH 30/52] create a test with the dry-run parameter enabled --- 
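Illustrative sketch only, not part of this patch: how a client could exercise the
dry-run path added here, assuming a server started with
`--experimental-replication-parameters` and listening on 127.0.0.1:7700. The route,
the `DryRun: true` header and the payload mirror the `dry_register_file` test below;
the `reqwest` call itself is just an example.

async fn dry_run_add_documents() -> Result<(), reqwest::Error> {
    let response = reqwest::Client::new()
        .post("http://127.0.0.1:7700/indexes/tamo/documents")
        .header("Content-Type", "application/json")
        .header("DryRun", "true")
        .body(r#"{ "id": "12", "doggo": "kefir" }"#)
        .send()
        .await?;
    // the task is summarized (202 Accepted) but never registered, so a later
    // GET /tasks/{uid} returns 404
    assert_eq!(response.status().as_u16(), 202);
    Ok(())
}
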
meilisearch/tests/common/index.rs | 9 +--- meilisearch/tests/documents/add_documents.rs | 49 +++++++++++++++++++- meilisearch/tests/documents/errors.rs | 41 ++++++++++------ 3 files changed, 76 insertions(+), 23 deletions(-) diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index 4992eeb13..16fc10e98 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -100,16 +100,11 @@ impl Index<'_> { pub async fn raw_add_documents( &self, payload: &str, - content_type: Option<&str>, + headers: Vec<(&str, &str)>, query_parameter: &str, ) -> (Value, StatusCode) { let url = format!("/indexes/{}/documents{}", urlencode(self.uid.as_ref()), query_parameter); - - if let Some(content_type) = content_type { - self.service.post_str(url, payload, vec![("Content-Type", content_type)]).await - } else { - self.service.post_str(url, payload, Vec::new()).await - } + self.service.post_str(url, payload, headers).await } pub async fn update_documents( diff --git a/meilisearch/tests/documents/add_documents.rs b/meilisearch/tests/documents/add_documents.rs index 9733f7741..e6af85229 100644 --- a/meilisearch/tests/documents/add_documents.rs +++ b/meilisearch/tests/documents/add_documents.rs @@ -1,10 +1,11 @@ use actix_web::test; use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; use crate::common::encoder::Encoder; -use crate::common::{GetAllDocumentsOptions, Server, Value}; +use crate::common::{default_settings, GetAllDocumentsOptions, Server, Value}; use crate::json; /// This is the basic usage of our API and every other tests uses the content-type application/json @@ -2157,3 +2158,49 @@ async fn batch_several_documents_addition() { assert_eq!(code, 200, "failed with `{}`", response); assert_eq!(response["results"].as_array().unwrap().len(), 120); } + +#[actix_rt::test] +async fn dry_register_file() { + let temp = tempfile::tempdir().unwrap(); + + let options = + Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) }; + let server = Server::new_with_options(options).await.unwrap(); + let index = server.index("tamo"); + + let documents = r#" + { + "id": "12", + "doggo": "kefir" + } + "#; + + let (response, code) = index + .raw_add_documents( + documents, + vec![("Content-Type", "application/json"), ("DryRun", "true")], + "", + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "tamo", + "status": "enqueued", + "type": "documentAdditionOrUpdate", + "enqueuedAt": "[date]" + } + "###); + snapshot!(code, @"202 Accepted"); + + let (response, code) = index.get_task(response.uid()).await; + snapshot!(response, @r###" + { + "message": "Task `0` not found.", + "code": "task_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#task_not_found" + } + "###); + snapshot!(code, @"404 Not Found"); +} diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index bd06aabce..cd2d89813 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -209,7 +209,8 @@ async fn replace_documents_missing_payload() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = index.raw_add_documents("", Some("application/json"), "").await; + let (response, code) = + index.raw_add_documents("", vec![("Content-Type", "application/json")], "").await; snapshot!(code, @"400 Bad Request"); 
snapshot!(json_string!(response), @r###" { @@ -220,7 +221,8 @@ async fn replace_documents_missing_payload() { } "###); - let (response, code) = index.raw_add_documents("", Some("application/x-ndjson"), "").await; + let (response, code) = + index.raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -231,7 +233,8 @@ async fn replace_documents_missing_payload() { } "###); - let (response, code) = index.raw_add_documents("", Some("text/csv"), "").await; + let (response, code) = + index.raw_add_documents("", vec![("Content-Type", "text/csv")], "").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -287,7 +290,7 @@ async fn replace_documents_missing_content_type() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = index.raw_add_documents("", None, "").await; + let (response, code) = index.raw_add_documents("", Vec::new(), "").await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -299,7 +302,7 @@ async fn replace_documents_missing_content_type() { "###); // even with a csv delimiter specified this error is triggered first - let (response, code) = index.raw_add_documents("", None, "?csvDelimiter=;").await; + let (response, code) = index.raw_add_documents("", Vec::new(), "?csvDelimiter=;").await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -345,7 +348,7 @@ async fn replace_documents_bad_content_type() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = index.raw_add_documents("", Some("doggo"), "").await; + let (response, code) = index.raw_add_documents("", vec![("Content-Type", "doggo")], "").await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -379,8 +382,9 @@ async fn replace_documents_bad_csv_delimiter() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = - index.raw_add_documents("", Some("application/json"), "?csvDelimiter").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter") + .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -391,8 +395,9 @@ async fn replace_documents_bad_csv_delimiter() { } "###); - let (response, code) = - index.raw_add_documents("", Some("application/json"), "?csvDelimiter=doggo").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=doggo") + .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -404,7 +409,11 @@ async fn replace_documents_bad_csv_delimiter() { "###); let (response, code) = index - .raw_add_documents("", Some("application/json"), &format!("?csvDelimiter={}", encode("🍰"))) + .raw_add_documents( + "", + vec![("Content-Type", "application/json")], + &format!("?csvDelimiter={}", encode("🍰")), + ) .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" @@ -469,8 +478,9 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() { let server = Server::new().await; let index = server.index("test"); - let (response, code) = - index.raw_add_documents("", Some("application/json"), "?csvDelimiter=a").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", 
"application/json")], "?csvDelimiter=a") + .await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { @@ -481,8 +491,9 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() { } "###); - let (response, code) = - index.raw_add_documents("", Some("application/x-ndjson"), "?csvDelimiter=a").await; + let (response, code) = index + .raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "?csvDelimiter=a") + .await; snapshot!(code, @"415 Unsupported Media Type"); snapshot!(json_string!(response), @r###" { From 0562818c2a1380d9d87cd5ca1a37bdc7b1bb8748 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 22 Feb 2024 18:42:12 +0100 Subject: [PATCH 31/52] fix and remove the file-store hack of /dev/null --- file-store/src/lib.rs | 56 +++++++++------------ index-scheduler/src/lib.rs | 54 ++++++++++---------- meilisearch-types/src/document_formats.rs | 16 +++--- meilisearch/src/routes/indexes/documents.rs | 8 ++- 4 files changed, 63 insertions(+), 71 deletions(-) diff --git a/file-store/src/lib.rs b/file-store/src/lib.rs index e3851a2df..15c4168bc 100644 --- a/file-store/src/lib.rs +++ b/file-store/src/lib.rs @@ -1,5 +1,5 @@ use std::fs::File as StdFile; -use std::ops::{Deref, DerefMut}; +use std::io::Write; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -22,20 +22,6 @@ pub enum Error { pub type Result = std::result::Result; -impl Deref for File { - type Target = NamedTempFile; - - fn deref(&self) -> &Self::Target { - &self.file - } -} - -impl DerefMut for File { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.file - } -} - #[derive(Clone, Debug)] pub struct FileStore { path: PathBuf, @@ -56,7 +42,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::new_v4(); let path = self.path.join(uuid.to_string()); - let update_file = File { dry: false, file, path }; + let update_file = File { file: Some(file), path }; Ok((uuid, update_file)) } @@ -67,7 +53,7 @@ impl FileStore { let file = NamedTempFile::new_in(&self.path)?; let uuid = Uuid::from_u128(uuid); let path = self.path.join(uuid.to_string()); - let update_file = File { dry: false, file, path }; + let update_file = File { file: Some(file), path }; Ok((uuid, update_file)) } @@ -135,33 +121,41 @@ impl FileStore { } pub struct File { - dry: bool, path: PathBuf, - file: NamedTempFile, + file: Option, } impl File { pub fn dry_file() -> Result { - #[cfg(target_family = "unix")] - let path = PathBuf::from_str("/dev/null").unwrap(); - #[cfg(target_family = "windows")] - let path = PathBuf::from_str("\\Device\\Null").unwrap(); - - Ok(Self { - dry: true, - path: path.clone(), - file: tempfile::Builder::new().make(|_| std::fs::File::create(path.clone()))?, - }) + Ok(Self { path: PathBuf::new(), file: None }) } pub fn persist(self) -> Result<()> { - if !self.dry { - self.file.persist(&self.path)?; + if let Some(file) = self.file { + file.persist(&self.path)?; } Ok(()) } } +impl Write for File { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if let Some(file) = self.file.as_mut() { + file.write(buf) + } else { + Ok(buf.len()) + } + } + + fn flush(&mut self) -> std::io::Result<()> { + if let Some(file) = self.file.as_mut() { + file.flush() + } else { + Ok(()) + } + } +} + #[cfg(test)] mod test { use std::io::Write; diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 5d0ce9eb9..1c3b93bce 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1550,7 +1550,7 @@ impl<'a> Dump<'a> { let content_uuid = 
match content_file { Some(content_file) if task.status == Status::Enqueued => { let (uuid, mut file) = self.index_scheduler.create_update_file(false)?; - let mut builder = DocumentsBatchBuilder::new(file.as_file_mut()); + let mut builder = DocumentsBatchBuilder::new(&mut file); for doc in content_file { builder.append_json_object(&doc?)?; } @@ -1734,7 +1734,7 @@ pub struct IndexStats { #[cfg(test)] mod tests { - use std::io::{BufWriter, Seek, Write}; + use std::io::{BufWriter, Write}; use std::time::Instant; use big_s::S; @@ -1882,7 +1882,7 @@ mod tests { /// Adapting to the new json reading interface pub fn read_json( bytes: &[u8], - write: impl Write + Seek, + write: impl Write, ) -> std::result::Result { let temp_file = NamedTempFile::new().unwrap(); let mut buffer = BufWriter::new(temp_file.reopen().unwrap()); @@ -1909,7 +1909,7 @@ mod tests { ); let (_uuid, mut file) = index_scheduler.create_update_file_with_uuid(file_uuid).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); (file, documents_count) } @@ -2321,7 +2321,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2366,7 +2366,7 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2406,7 +2406,7 @@ mod tests { ]"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2472,7 +2472,7 @@ mod tests { ]"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2678,7 +2678,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2852,7 +2852,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2903,7 +2903,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -2956,7 +2956,7 
@@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3010,7 +3010,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3065,7 +3065,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3567,7 +3567,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3609,7 +3609,7 @@ mod tests { }"#; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3669,7 +3669,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3721,7 +3721,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3783,7 +3783,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3850,7 +3850,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3922,7 +3922,7 @@ mod tests { let allow_index_creation = i % 2 != 0; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -3979,7 +3979,7 @@ mod tests { let allow_index_creation = i % 2 != 0; let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let 
documents_count = read_json(content.as_bytes(), &mut file).unwrap(); file.persist().unwrap(); index_scheduler .register( @@ -4033,7 +4033,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4098,7 +4098,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4159,7 +4159,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4244,7 +4244,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); @@ -4331,7 +4331,7 @@ mod tests { ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); assert_eq!(documents_count, 1); file.persist().unwrap(); diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs index 0f1d995f9..50dc5bad4 100644 --- a/meilisearch-types/src/document_formats.rs +++ b/meilisearch-types/src/document_formats.rs @@ -1,6 +1,6 @@ use std::fmt::{self, Debug, Display}; use std::fs::File; -use std::io::{self, Seek, Write}; +use std::io::{self, BufWriter, Write}; use std::marker::PhantomData; use memmap2::MmapOptions; @@ -104,8 +104,8 @@ impl ErrorCode for DocumentFormatError { } /// Reads CSV from input and write an obkv batch to writer. -pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); +pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result { + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); let mmap = unsafe { MmapOptions::new().map(file)? }; let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref()); builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?; @@ -116,9 +116,9 @@ pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result Ok(count as u64) } -/// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_json(file: &File, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); +/// Reads JSON from temporary file and write an obkv batch to writer. +pub fn read_json(file: &File, writer: impl Write) -> Result { + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); let mmap = unsafe { MmapOptions::new().map(file)? 
}; let mut deserializer = serde_json::Deserializer::from_slice(&mmap); @@ -151,8 +151,8 @@ pub fn read_json(file: &File, writer: impl Write + Seek) -> Result { } /// Reads JSON from temporary file and write an obkv batch to writer. -pub fn read_ndjson(file: &File, writer: impl Write + Seek) -> Result { - let mut builder = DocumentsBatchBuilder::new(writer); +pub fn read_ndjson(file: &File, writer: impl Write) -> Result { + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer)); let mmap = unsafe { MmapOptions::new().map(file)? }; for result in serde_json::Deserializer::from_slice(&mmap).into_iter() { diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index a74bbff49..43fab1dae 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -425,11 +425,9 @@ async fn document_addition( let read_file = buffer.into_inner().into_std().await; let documents_count = tokio::task::spawn_blocking(move || { let documents_count = match format { - PayloadType::Json => read_json(&read_file, update_file.as_file_mut())?, - PayloadType::Csv { delimiter } => { - read_csv(&read_file, update_file.as_file_mut(), delimiter)? - } - PayloadType::Ndjson => read_ndjson(&read_file, update_file.as_file_mut())?, + PayloadType::Json => read_json(&read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, + PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, }; // we NEED to persist the file here because we moved the `udpate_file` in another task. update_file.persist()?; From 5e83bac448dabea20dc9d43eb161d07de0f15d3a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 26 Feb 2024 15:40:15 +0100 Subject: [PATCH 32/52] Fix PR comments --- milli/src/update/facet/incremental.rs | 59 +++++++++++++++------------ 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 584870a7a..798e0fe3d 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -18,6 +18,32 @@ use crate::update::index_documents::valid_lmdb_key; use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, Index, Result}; +/// Enum used as a return value for the facet incremental indexing. +/// +/// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have +/// an effect on the number of keys in that level. Therefore, it did not increase the number of children +/// of the parent node. +/// +/// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted +/// in the addition of a new key in that level, and that therefore the number of children +/// of the parent node should be incremented. +/// +/// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the +/// number of keys in the level. For example, removing a document id from the facet value `3` could +/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted +/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must +/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. 
+/// +/// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the +/// bounds of the keys of the level. For example, removing a document id from the facet value +/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, +/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). +/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust +/// its left bound as well. +/// +/// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. +/// This case is reachable when a document id is removed from a sub-level node but is still present in another one. +/// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. enum ModificationResult { InPlace, Expand, @@ -315,6 +341,9 @@ impl FacetsUpdateIncrementalInner { Ok(ModificationResult::Insert) } + /// Remove the docids still present in the related sub-level nodes from the del_docids. + /// + /// This process is needed to avoid removing docids from a group node where the docid is present in several sub-nodes. fn trim_del_docids<'a>( &self, txn: &mut RwTxn, @@ -352,30 +381,6 @@ impl FacetsUpdateIncrementalInner { /// ## Return /// Returns the effect of modifying the facet value to the database on the given `level`. /// - /// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have - /// an effect on the number of keys in that level. Therefore, it did not increase the number of children - /// of the parent node. - /// - /// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted - /// in the addition of a new key in that level, and that therefore the number of children - /// of the parent node should be incremented. - /// - /// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the - /// number of keys in the level. For example, removing a document id from the facet value `3` could - /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted - /// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must - /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. - /// - /// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the - /// bounds of the keys of the level. For example, removing a document id from the facet value - /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, - /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). - /// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust - /// its left bound as well. - /// - /// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. - /// This case is reachable when a document id is removed from a sub-level node but is still present in another one. 
- /// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. fn modify_in_level( &self, txn: &mut RwTxn, @@ -465,7 +470,7 @@ impl FacetsUpdateIncrementalInner { if updated_value.size < self.max_group_size { // If there are docids to delete, trim them avoiding unexpected removal. - let del_docids = del_docids + if let Some(del_docids) = del_docids .map(|ids| { self.trim_del_docids( txn, @@ -477,8 +482,8 @@ impl FacetsUpdateIncrementalInner { ) }) .transpose()? - .filter(|ids| !ids.is_empty()); - if let Some(del_docids) = del_docids { + .filter(|ids| !ids.is_empty()) + { updated_value.bitmap -= &*del_docids; insertion_value_was_modified = true; } From 716ffc07eec12b97c49f1be130b870756c6350f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 26 Feb 2024 22:15:57 +0100 Subject: [PATCH 33/52] Build the embedders when importing a dump --- meilisearch/src/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 0d892e7e8..9d9274b9d 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -426,6 +426,9 @@ fn import_dump( let reader = BufReader::new(file); let reader = DocumentsBatchReader::from_reader(reader)?; + let embedder_configs = index.embedding_configs(&wtxn)?; + let embedders = index_scheduler.embedders(embedder_configs)?; + let builder = milli::update::IndexDocuments::new( &mut wtxn, &index, @@ -438,6 +441,8 @@ fn import_dump( || false, )?; + let builder = builder.with_embedders(embedders); + let (builder, user_result) = builder.add_documents(reader)?; let user_result = user_result?; tracing::info!(documents_found = user_result, "{} documents found.", user_result); From 452a343a2b30855c881203e5fb258639f5215127 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 28 Feb 2024 17:53:54 +0100 Subject: [PATCH 34/52] Fix imports --- dump/src/reader/compat/v2_to_v3.rs | 1 - index-scheduler/src/uuid_codec.rs | 1 - meilisearch-auth/src/store.rs | 1 - meilisearch/src/option.rs | 1 - meilisearch/src/routes/api_key.rs | 2 +- meilitool/src/uuid_codec.rs | 1 - milli/src/search/new/query_term/compute_derivations.rs | 5 +++-- milli/src/search/new/query_term/mod.rs | 1 - milli/src/search/new/query_term/parse_query.rs | 8 ++++++-- 9 files changed, 10 insertions(+), 11 deletions(-) diff --git a/dump/src/reader/compat/v2_to_v3.rs b/dump/src/reader/compat/v2_to_v3.rs index 1d4238290..82a3b9e84 100644 --- a/dump/src/reader/compat/v2_to_v3.rs +++ b/dump/src/reader/compat/v2_to_v3.rs @@ -1,4 +1,3 @@ -use std::convert::TryInto; use std::str::FromStr; use time::OffsetDateTime; diff --git a/index-scheduler/src/uuid_codec.rs b/index-scheduler/src/uuid_codec.rs index 54020fa3c..92dc70b0c 100644 --- a/index-scheduler/src/uuid_codec.rs +++ b/index-scheduler/src/uuid_codec.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::convert::TryInto; use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode}; use uuid::Uuid; diff --git a/meilisearch-auth/src/store.rs b/meilisearch-auth/src/store.rs index 276c035b0..1eebd3fe9 100644 --- a/meilisearch-auth/src/store.rs +++ b/meilisearch-auth/src/store.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::cmp::Reverse; use std::collections::HashSet; -use std::convert::{TryFrom, TryInto}; use std::fs::create_dir_all; use std::path::Path; use std::result::Result as StdResult; diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 27f2d9c41..92d53fd32 100644 --- 
a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -1,4 +1,3 @@ -use std::convert::TryFrom; use std::env::VarError; use std::ffi::OsStr; use std::fmt::Display; diff --git a/meilisearch/src/routes/api_key.rs b/meilisearch/src/routes/api_key.rs index 597d04486..0bd4b9d59 100644 --- a/meilisearch/src/routes/api_key.rs +++ b/meilisearch/src/routes/api_key.rs @@ -10,7 +10,7 @@ use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; -use meilisearch_types::keys::{Action, CreateApiKey, Key, PatchApiKey}; +use meilisearch_types::keys::{CreateApiKey, Key, PatchApiKey}; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use uuid::Uuid; diff --git a/meilitool/src/uuid_codec.rs b/meilitool/src/uuid_codec.rs index 54020fa3c..92dc70b0c 100644 --- a/meilitool/src/uuid_codec.rs +++ b/meilitool/src/uuid_codec.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::convert::TryInto; use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode}; use uuid::Uuid; diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index d5dfbbcd0..02754929a 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -6,9 +6,10 @@ use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; -use super::*; +use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; -use crate::search::new::query_term::TwoTypoTerm; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_term::{Lazy, TwoTypoTerm}; use crate::search::new::{limits, SearchContext}; use crate::search::{build_dfa, get_first}; use crate::{Result, MAX_WORD_LENGTH}; diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 6760c8be7..a37e60ed0 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -7,7 +7,6 @@ use std::collections::BTreeSet; use std::iter::FromIterator; use std::ops::RangeInclusive; -use compute_derivations::partially_initialized_term_from_word; use either::Either; pub use ntypo_subset::NTypoTermSubset; pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed}; diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 865075d97..8ab93ed3b 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -1,7 +1,11 @@ +use std::collections::BTreeSet; + use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; -use super::*; +use super::compute_derivations::partially_initialized_term_from_word; +use super::{LocatedQueryTerm, ZeroTypoTerm}; +use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. 
@@ -225,7 +229,7 @@ pub fn make_ngram( } struct PhraseBuilder { - words: Vec>>, + words: Vec>>, start: u16, end: u16, } From 9806a3e5f6f7dd3bd1478a1315f13aa429e449a2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 29 Feb 2024 14:24:50 +0100 Subject: [PATCH 35/52] Don't test on nightly --- .github/workflows/test-suite.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 18e9fc48a..5dbde4301 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -31,17 +31,10 @@ jobs: apt-get update && apt-get install -y curl apt-get install build-essential -y - name: Setup test with Rust stable - if: github.event_name != 'schedule' uses: actions-rs/toolchain@v1 with: toolchain: stable override: true - - name: Setup test with Rust nightly - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' - uses: actions-rs/toolchain@v1 - with: - toolchain: nightly - override: true - name: Cache dependencies uses: Swatinem/rust-cache@v2.7.1 - name: Run cargo check without any default features From eada6de261913b449e4dca643663ba04b821e031 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 4 Mar 2024 18:02:54 +0100 Subject: [PATCH 36/52] Divide threshold by ten --- milli/src/update/facet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 15a646836..0af64c4c5 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -149,7 +149,7 @@ impl<'i> FacetsUpdate<'i> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // See self::comparison_bench::benchmark_facet_indexing - if self.data_size >= (self.database.len(wtxn)? / 50) { + if self.data_size >= (self.database.len(wtxn)? 
/ 500) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( From f75c7ac979f45c3f6b40b21c00944bca702200f8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 4 Mar 2024 14:29:31 +0100 Subject: [PATCH 37/52] Compile xtask in --release --- .cargo/config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 35049cbcb..e11d56a31 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,2 @@ [alias] -xtask = "run --package xtask --" +xtask = "run --release --package xtask --" From 6862caef648b046c23439ca05bf9a494e6d0fd37 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 26 Feb 2024 16:37:35 +0100 Subject: [PATCH 38/52] Span Stats compute self-time --- tracing-trace/src/processor/span_stats.rs | 92 +++++++++++++++++++++-- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/tracing-trace/src/processor/span_stats.rs b/tracing-trace/src/processor/span_stats.rs index f3e6238ff..584fe53f8 100644 --- a/tracing-trace/src/processor/span_stats.rs +++ b/tracing-trace/src/processor/span_stats.rs @@ -1,4 +1,5 @@ use std::collections::{BTreeMap, HashMap}; +use std::ops::Range; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -16,6 +17,51 @@ enum SpanStatus { pub struct CallStats { pub call_count: usize, pub time: u64, + pub self_time: u64, +} + +#[derive(Debug, Default)] +pub struct SelfTime { + child_ranges: Vec>, +} + +impl SelfTime { + pub fn new() -> Self { + Default::default() + } + + pub fn add_child_range(&mut self, child_range: Range) { + self.child_ranges.push(child_range) + } + + pub fn self_duration(&mut self, self_range: Range) -> Duration { + if self.child_ranges.is_empty() { + return self_range.end - self_range.start; + } + + // by sorting child ranges by their start time, + // we make sure that no child will start before the last one we visited. + self.child_ranges + .sort_by(|left, right| left.start.cmp(&right.start).then(left.end.cmp(&right.end))); + // self duration computed by adding all the segments where the span is not executing a child + let mut self_duration = Duration::from_nanos(0); + + // last point in time where we are certain that this span was not executing a child. 
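+        // e.g. with children [2..4] and [6..7] inside a span [0..10], the gaps
+        // 0..2 and 4..6 below are credited to this span's self time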
+ let mut committed_point = self_range.start; + + for child_range in &self.child_ranges { + if child_range.start > committed_point { + // we add to the self duration the point between the end of the latest span and the beginning of the next span + self_duration += child_range.start - committed_point; + } + if committed_point < child_range.end { + // then we set ourselves to the end of the latest span + committed_point = child_range.end; + } + } + + self_duration + } } pub fn to_call_stats( @@ -23,6 +69,9 @@ pub fn to_call_stats( ) -> Result, Error> { let mut calls = HashMap::new(); let mut spans = HashMap::new(); + let mut last_point = Duration::from_nanos(0); + let mut first_point = None; + let mut total_self_time = SelfTime::new(); for entry in trace { let entry = entry?; match entry { @@ -31,10 +80,11 @@ pub fn to_call_stats( } Entry::NewThread(_) => {} Entry::NewSpan(span) => { - spans.insert(span.id, (span, SpanStatus::Outside)); + spans.insert(span.id, (span, SpanStatus::Outside, SelfTime::new())); } Entry::SpanEnter(SpanEnter { id, time, memory: _ }) => { - let (_, status) = spans.get_mut(&id).unwrap(); + first_point.get_or_insert(time); + let (_, status, _) = spans.get_mut(&id).unwrap(); let SpanStatus::Outside = status else { continue; @@ -43,18 +93,32 @@ pub fn to_call_stats( *status = SpanStatus::Inside(time); } Entry::SpanExit(SpanExit { id, time: end, memory: _ }) => { - let (span, status) = spans.get_mut(&id).unwrap(); + let (span, status, self_time) = spans.get_mut(&id).unwrap(); let SpanStatus::Inside(begin) = status else { continue; }; let begin = *begin; + if last_point < end { + last_point = end; + } + *status = SpanStatus::Outside; + let self_range = begin..end; + + let self_duration = self_time.self_duration(self_range.clone()); + *self_time = SelfTime::new(); + let span = *span; + if let Some(parent_id) = span.parent_id { + let (_, _, parent_self_time) = spans.get_mut(&parent_id).unwrap(); + parent_self_time.add_child_range(self_range.clone()) + } + total_self_time.add_child_range(self_range); let (_, call_list) = calls.get_mut(&span.call_id).unwrap(); - call_list.push(end - begin); + call_list.push((end - begin, self_duration)); } Entry::SpanClose(SpanClose { id, time: _ }) => { spans.remove(&id); @@ -63,17 +127,31 @@ pub fn to_call_stats( } } + let total_self_time = first_point + .map(|first_point| (first_point, total_self_time.self_duration(first_point..last_point))); + Ok(calls .into_iter() .map(|(_, (call_site, calls))| (site_to_string(call_site), calls_to_stats(calls))) + .chain(total_self_time.map(|(first_point, total_self_time)| { + ( + "::meta::total".to_string(), + CallStats { + call_count: 1, + time: (last_point - first_point).as_nanos() as u64, + self_time: total_self_time.as_nanos() as u64, + }, + ) + })) .collect()) } fn site_to_string(call_site: NewCallsite) -> String { format!("{}::{}", call_site.target, call_site.name) } -fn calls_to_stats(calls: Vec) -> CallStats { +fn calls_to_stats(calls: Vec<(Duration, Duration)>) -> CallStats { let nb = calls.len(); - let sum: Duration = calls.iter().sum(); - CallStats { call_count: nb, time: sum.as_nanos() as u64 } + let sum: Duration = calls.iter().map(|(total, _)| total).sum(); + let self_sum: Duration = calls.iter().map(|(_, self_duration)| self_duration).sum(); + CallStats { call_count: nb, time: sum.as_nanos() as u64, self_time: self_sum.as_nanos() as u64 } } From b11df7ec341ba5987e2be644a008c6b5937666e9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 26 Feb 2024 16:38:17 +0100 Subject: [PATCH 
39/52] Meilisearch: fix some wrong spans --- milli/src/update/index_documents/extract/mod.rs | 3 +-- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 251a2db99..43f3f4947 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -210,8 +210,7 @@ fn run_extraction_task( let current_span = tracing::Span::current(); rayon::spawn(move || { - let child_span = - tracing::trace_span!(target: "", parent: ¤t_span, "extract_multiple_chunks"); + let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks"); let _entered = child_span.enter(); puffin::profile_scope!("extract_multiple_chunks", name); match extract_fn(chunk, indexer) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 61ca1a024..7499b68e5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -284,7 +284,7 @@ where #[tracing::instrument( level = "trace", skip_all, - target = "profile::indexing::details", + target = "indexing::details", name = "index_documents_raw" )] pub fn execute_raw(self, output: TransformOutput) -> Result diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 1fea9a70f..6aad290e5 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -473,7 +473,7 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetIsEmptyDocids(_) => { - let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids"); + let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids"); let _entered = span.enter(); let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn); From 86ce843f3d74d9acc02552af17903b6dc3fd4fc1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 26 Feb 2024 21:29:20 +0100 Subject: [PATCH 40/52] Add cargo xtask bench --- Cargo.lock | 57 +- xtask/Cargo.toml | 25 + xtask/src/bench/env_info.rs | 129 ++++ xtask/src/bench/mod.rs | 1159 +++++++++++++++++++++++++++++++++++ xtask/src/lib.rs | 1 + xtask/src/main.rs | 6 +- 6 files changed, 1370 insertions(+), 7 deletions(-) create mode 100644 xtask/src/bench/env_info.rs create mode 100644 xtask/src/bench/mod.rs create mode 100644 xtask/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 971ab602a..eca3b2fbc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -440,6 +440,12 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "atomic" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" + [[package]] name = "atomic-polyfill" version = "0.1.11" @@ -3488,6 +3494,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -4218,10 +4230,12 @@ dependencies = [ "system-configuration", "tokio", "tokio-rustls 0.24.1", + "tokio-util", "tower-service", "url", 
"wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots 0.25.3", "winreg", @@ -4934,12 +4948,13 @@ dependencies = [ [[package]] name = "time" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" +checksum = "fe80ced77cbfb4cb91a94bf72b378b4b6791a0d9b7f09d0be747d1bdff4e68bd" dependencies = [ "deranged", "itoa", + "num-conv", "powerfmt", "serde", "time-core", @@ -4954,10 +4969,11 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" +checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" dependencies = [ + "num-conv", "time-core", ] @@ -5395,10 +5411,11 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" dependencies = [ + "atomic", "getrandom", "serde", ] @@ -5539,6 +5556,19 @@ version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wav" version = "1.0.0" @@ -5873,8 +5903,23 @@ dependencies = [ name = "xtask" version = "1.7.0" dependencies = [ + "anyhow", "cargo_metadata", "clap", + "futures-core", + "futures-util", + "git2", + "reqwest", + "serde", + "serde_json", + "sha2", + "sysinfo", + "time", + "tokio", + "tracing", + "tracing-subscriber", + "tracing-trace", + "uuid", ] [[package]] diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 07271ea09..a59a79e53 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -11,5 +11,30 @@ license.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow = "1.0.79" cargo_metadata = "0.18.1" clap = { version = "4.4.14", features = ["derive"] } +futures-core = "0.3.30" +futures-util = "0.3.30" +git2 = { version = "0.16", default_features = false } +reqwest = { version = "0.11.23", features = [ + "stream", + "json", + "rustls-tls", +], default_features = false } +serde = { version = "1.0.195", features = ["derive"] } +serde_json = "1.0.111" +sha2 = "0.10.8" +sysinfo = "0.30.5" +time = { version = "0.3.32", features = ["serde", "serde-human-readable"] } +tokio = { version = "1.35.1", features = [ + "rt", + "net", + "time", + "process", + "signal", +] } +tracing = "0.1.40" +tracing-subscriber = "0.3.18" +tracing-trace = { version = "0.1.0", path = "../tracing-trace" } +uuid = { version = "1.7.0", features = ["v7", "serde"] } diff --git a/xtask/src/bench/env_info.rs b/xtask/src/bench/env_info.rs new file mode 100644 index 000000000..5cbeb4274 --- /dev/null +++ b/xtask/src/bench/env_info.rs @@ -0,0 +1,129 @@ +use 
serde::{Deserialize, Serialize};
+use time::OffsetDateTime;
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Source {
+    pub repo_url: Option<String>,
+    pub branch_or_tag: String,
+    pub commit_id: String,
+    pub commit_msg: String,
+    pub author_name: String,
+    pub author_email: String,
+    pub committer_name: String,
+    pub committer_email: String,
+}
+
+impl Source {
+    pub fn from_repo(
+        path: impl AsRef<std::path::Path>,
+    ) -> Result<(Self, OffsetDateTime), git2::Error> {
+        use git2::Repository;
+
+        let repo = Repository::open(path)?;
+        let remote = repo.remotes()?;
+        let remote = remote.get(0).expect("No remote associated to the repo");
+        let remote = repo.find_remote(remote)?;
+
+        let head = repo.head()?;
+
+        let commit = head.peel_to_commit()?;
+
+        let time = OffsetDateTime::from_unix_timestamp(commit.time().seconds()).unwrap();
+
+        let author = commit.author();
+        let committer = commit.committer();
+
+        Ok((
+            Self {
+                repo_url: remote.url().map(|s| s.to_string()),
+                branch_or_tag: head.name().unwrap().to_string(),
+                commit_id: commit.id().to_string(),
+                commit_msg: String::from_utf8_lossy(commit.message_bytes())
+                    .to_string()
+                    .lines()
+                    .next()
+                    .map_or(String::new(), |s| s.to_string()),
+                author_name: author.name().unwrap().to_string(),
+                author_email: author.email().unwrap().to_string(),
+                committer_name: committer.name().unwrap().to_string(),
+                committer_email: committer.email().unwrap().to_string(),
+            },
+            time,
+        ))
+    }
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Environment {
+    pub hostname: Option<String>,
+    pub cpu: String,
+
+    /// Advertised or nominal clock speed in Hertz.
+    pub clock_speed: u64,
+
+    /// Total number of bytes of memory provided by the system.
+    pub memory: u64,
+    pub os_type: String,
+    pub software: Vec<VersionInfo>,
+
+    pub user_name: String,
+
+    /// Is set to true when the data was gathered by a manual run,
+    /// possibly on a developer machine, instead of the usual benchmark server.
+ pub manual_run: bool, +} + +impl Environment { + pub fn generate_from_current_config() -> Self { + use sysinfo::System; + + let unknown_string = String::from("Unknown"); + let mut system = System::new(); + system.refresh_cpu(); + system.refresh_cpu_frequency(); + system.refresh_memory(); + + let (cpu, frequency) = match system.cpus().first() { + Some(cpu) => ( + format!("{} @ {:.2}GHz", cpu.brand(), cpu.frequency() as f64 / 1000.0), + cpu.frequency() * 1_000_000, + ), + None => (unknown_string.clone(), 0), + }; + + let mut software = Vec::new(); + if let Some(distribution) = System::name() { + software + .push(VersionInfo { name: distribution, version: String::from("distribution") }); + } + if let Some(kernel) = System::kernel_version() { + software.push(VersionInfo { name: kernel, version: String::from("kernel") }); + } + if let Some(os) = System::os_version() { + software.push(VersionInfo { name: os, version: String::from("kernel-release") }); + } + if let Some(arch) = System::cpu_arch() { + software.push(VersionInfo { name: arch, version: String::from("arch") }); + } + + Self { + hostname: System::host_name(), + cpu, + clock_speed: frequency, + memory: system.total_memory(), + os_type: System::long_os_version().unwrap_or(unknown_string.clone()), + user_name: System::name().unwrap_or(unknown_string.clone()), + manual_run: false, + software, + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct VersionInfo { + pub name: String, + pub version: String, +} diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs new file mode 100644 index 000000000..ea17b6f69 --- /dev/null +++ b/xtask/src/bench/mod.rs @@ -0,0 +1,1159 @@ +mod env_info; + +use std::collections::BTreeMap; +use std::fmt::Display; +use std::io::{Read, Seek, Write}; +use std::path::PathBuf; + +use anyhow::{bail, Context}; +use clap::Parser; +use futures_util::TryStreamExt; +use serde::Deserialize; +use serde_json::json; +use sha2::Digest; +use tracing_subscriber::fmt::format::FmtSpan; +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::Layer; +use uuid::Uuid; + +pub fn default_http_addr() -> String { + "127.0.0.1:7700".to_string() +} +pub fn default_report_folder() -> String { + "./bench/reports/".into() +} + +pub fn default_asset_folder() -> String { + "./bench/assets/".into() +} + +pub fn default_log_filter() -> String { + "info".into() +} + +pub fn default_dashboard_url() -> String { + "http://localhost:9001".into() +} + +#[derive(Debug, Clone)] +pub struct Client { + base_url: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new( + base_url: Option, + api_key: Option<&str>, + timeout: Option, + ) -> anyhow::Result { + let mut headers = reqwest::header::HeaderMap::new(); + if let Some(api_key) = api_key { + headers.append( + reqwest::header::AUTHORIZATION, + reqwest::header::HeaderValue::from_str(&format!("Bearer {api_key}")) + .context("Invalid authorization header")?, + ); + } + + let client = reqwest::ClientBuilder::new().default_headers(headers); + let client = if let Some(timeout) = timeout { client.timeout(timeout) } else { client }; + let client = client.build()?; + Ok(Self { base_url, client }) + } + + pub fn request(&self, method: reqwest::Method, route: &str) -> reqwest::RequestBuilder { + if let Some(base_url) = &self.base_url { + if route.is_empty() { + self.client.request(method, base_url) + } else { + self.client.request(method, format!("{}/{}", base_url, route)) + } + } else { + self.client.request(method, route) + } 
+    }
+
+    pub fn get(&self, route: &str) -> reqwest::RequestBuilder {
+        self.request(reqwest::Method::GET, route)
+    }
+
+    pub fn put(&self, route: &str) -> reqwest::RequestBuilder {
+        self.request(reqwest::Method::PUT, route)
+    }
+
+    pub fn post(&self, route: &str) -> reqwest::RequestBuilder {
+        self.request(reqwest::Method::POST, route)
+    }
+
+    pub fn delete(&self, route: &str) -> reqwest::RequestBuilder {
+        self.request(reqwest::Method::DELETE, route)
+    }
+}
+
+/// Run benchmarks from a workload
+#[derive(Parser, Debug)]
+pub struct BenchDeriveArgs {
+    /// Filename of the workload file, pass multiple filenames
+    /// to run multiple workloads in the specified order.
+    ///
+    /// Each workload run will get its own report file.
+    #[arg(value_name = "WORKLOAD_FILE", last = false)]
+    workload_file: Vec<PathBuf>,
+
+    /// URL of the dashboard.
+    #[arg(long, default_value_t = default_dashboard_url())]
+    dashboard_url: String,
+
+    /// Directory to output reports.
+    #[arg(long, default_value_t = default_report_folder())]
+    report_folder: String,
+
+    /// Directory to store the remote assets.
+    #[arg(long, default_value_t = default_asset_folder())]
+    asset_folder: String,
+
+    /// Log directives
+    #[arg(short, long, default_value_t = default_log_filter())]
+    log_filter: String,
+
+    /// Benchmark dashboard API key
+    #[arg(long)]
+    api_key: Option<String>,
+
+    /// Meilisearch master keys
+    #[arg(long)]
+    master_key: Option<String>,
+
+    /// Authentication bearer for fetching assets
+    #[arg(long)]
+    assets_key: Option<String>,
+
+    /// Reason for the benchmark invocation
+    #[arg(short, long)]
+    reason: Option<String>,
+}
+
+#[derive(Deserialize)]
+pub struct Workload {
+    pub name: String,
+    pub run_count: u16,
+    pub extra_cli_args: Vec<String>,
+    pub assets: BTreeMap<String, Asset>,
+    pub commands: Vec<Command>,
+}
+
+#[derive(Deserialize, Clone)]
+pub struct Asset {
+    pub local_location: Option<String>,
+    pub remote_location: Option<String>,
+    #[serde(default)]
+    pub format: AssetFormat,
+    pub sha256: Option<String>,
+}
+
+#[derive(Deserialize, Default, Copy, Clone)]
+pub enum AssetFormat {
+    #[default]
+    Auto,
+    Json,
+    NdJson,
+    Raw,
+}
+impl AssetFormat {
+    fn to_content_type(self, filename: &str) -> &'static str {
+        match self {
+            AssetFormat::Auto => Self::auto_detect(filename).to_content_type(filename),
+            AssetFormat::Json => "application/json",
+            AssetFormat::NdJson => "application/x-ndjson",
+            AssetFormat::Raw => "application/octet-stream",
+        }
+    }
+
+    fn auto_detect(filename: &str) -> Self {
+        let path = std::path::Path::new(filename);
+        match path.extension().and_then(|extension| extension.to_str()) {
+            Some(extension) if extension.eq_ignore_ascii_case("json") => Self::Json,
+            Some(extension) if extension.eq_ignore_ascii_case("ndjson") => Self::NdJson,
+            extension => {
+                tracing::warn!(asset = filename, ?extension, "asset has format `Auto`, but extension was not recognized.
Specify `Raw` format to suppress this warning.");
+                AssetFormat::Raw
+            }
+        }
+    }
+}
+
+#[derive(Clone, Deserialize)]
+pub struct Command {
+    pub route: String,
+    pub method: Method,
+    #[serde(default)]
+    pub body: Body,
+    #[serde(default)]
+    pub synchronous: SyncMode,
+}
+
+#[derive(Default, Clone, Deserialize)]
+#[serde(untagged)]
+pub enum Body {
+    Inline {
+        inline: serde_json::Value,
+    },
+    Asset {
+        asset: String,
+    },
+    #[default]
+    Empty,
+}
+
+impl Body {
+    pub fn get(
+        self,
+        assets: &BTreeMap<String, Asset>,
+        asset_folder: &str,
+    ) -> anyhow::Result<Option<(Vec<u8>, &'static str)>> {
+        Ok(match self {
+            Body::Inline { inline: body } => Some((
+                serde_json::to_vec(&body)
+                    .context("serializing to bytes")
+                    .context("while getting inline body")?,
+                "application/json",
+            )),
+            Body::Asset { asset: name } => Some({
+                let context = || format!("while getting body from asset '{name}'");
+                let (mut file, format) =
+                    fetch_asset(&name, assets, asset_folder).with_context(context)?;
+                let mut buf = Vec::new();
+                file.read_to_end(&mut buf).with_context(context)?;
+                (buf, format.to_content_type(&name))
+            }),
+            Body::Empty => None,
+        })
+    }
+}
+
+fn fetch_asset(
+    name: &str,
+    assets: &BTreeMap<String, Asset>,
+    asset_folder: &str,
+) -> anyhow::Result<(std::fs::File, AssetFormat)> {
+    let asset =
+        assets.get(name).with_context(|| format!("could not find asset with name '{name}'"))?;
+    let filename = if let Some(local_filename) = &asset.local_location {
+        local_filename.clone()
+    } else {
+        format!("{asset_folder}/{name}")
+    };
+
+    Ok((
+        std::fs::File::open(&filename)
+            .with_context(|| format!("could not open asset '{name}' at '{filename}'"))?,
+        asset.format,
+    ))
+}
+
+impl Display for Command {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?} {} ({:?})", self.method, self.route, self.synchronous)
+    }
+}
+
+#[derive(Debug, Clone, Copy, Deserialize)]
+pub enum Method {
+    GET,
+    POST,
+    PATCH,
+    DELETE,
+    PUT,
+}
+
+impl From<Method> for reqwest::Method {
+    fn from(value: Method) -> Self {
+        match value {
+            Method::GET => Self::GET,
+            Method::POST => Self::POST,
+            Method::PATCH => Self::PATCH,
+            Method::DELETE => Self::DELETE,
+            Method::PUT => Self::PUT,
+        }
+    }
+}
+
+#[derive(Default, Debug, Clone, Copy, Deserialize)]
+pub enum SyncMode {
+    DontWait,
+    #[default]
+    WaitForResponse,
+    WaitForTask,
+}
+
+pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> {
+    let filter: tracing_subscriber::filter::Targets =
+        args.log_filter.parse().context("invalid --log-filter")?;
+
+    let env = env_info::Environment::generate_from_current_config();
+    let (source, commit_date) =
+        env_info::Source::from_repo(".").context("could not get repository information")?;
+
+    let subscriber = tracing_subscriber::registry().with(
+        tracing_subscriber::fmt::layer()
+            .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE)
+            .with_filter(filter),
+    );
+    tracing::subscriber::set_global_default(subscriber).context("could not setup logging")?;
+
+    let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?;
+    let _scope = rt.enter();
+
+    let assets_client =
+        Client::new(None, args.assets_key.as_deref(), Some(std::time::Duration::from_secs(3600)))?; // 1h
+
+    let dashboard_client = Client::new(
+        Some(format!("{}/api/v1", args.dashboard_url)),
+        args.api_key.as_deref(),
+        Some(std::time::Duration::from_secs(60)),
+    )?;
+
+    // reporting uses its own client because keeping the stream open to wait for entries
+    // blocks any other requests
+    // Also we don't want any pesky timeout because we don't know how
much time it will take to recover the full trace + let logs_client = Client::new( + Some("http://127.0.0.1:7700/logs/stream".into()), + args.master_key.as_deref(), + None, + )?; + + let meili_client = Client::new( + Some("http://127.0.0.1:7700".into()), + args.master_key.as_deref(), + Some(std::time::Duration::from_secs(60)), + )?; + + rt.block_on(async { + let response = dashboard_client + .put("machine") + .json(&json!({"hostname": env.hostname})) + .send() + .await + .context("sending machine information")?; + if !response.status().is_success() { + bail!( + "could not send machine information: {} {}", + response.status(), + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); + } + + let commit_message = source.commit_msg.split('\n').next().unwrap(); + let max_workloads = args.workload_file.len(); + let reason: Option<&str> = args.reason.as_deref(); + let response = dashboard_client + .put("invocation") + .json(&json!({ + "commit": { + "sha1": source.commit_id, + "message": commit_message, + "commit_date": commit_date, + "branch": source.branch_or_tag + }, + "machine_hostname": env.hostname, + "max_workloads": max_workloads, + "reason": reason + })) + .send() + .await + .context("sending invocation")?; + + if !response.status().is_success() { + bail!( + "could not send new invocation: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); + } + + let invocation_uuid: Uuid = + response.json().await.context("could not deserialize invocation response as JSON")?; + + + + tracing::info!(workload_count = args.workload_file.len(), "handling workload files"); + let workload_runs = tokio::spawn( + { + let dashboard_client = dashboard_client.clone(); + async move { + for workload_file in args.workload_file.iter() { + let workload: Workload = serde_json::from_reader( + std::fs::File::open(workload_file) + .with_context(|| format!("error opening {}", workload_file.display()))?, + ) + .with_context(|| format!("error parsing {} as JSON", workload_file.display()))?; + + run_workload( + &assets_client, + &dashboard_client, + &logs_client, + &meili_client, + invocation_uuid, + args.master_key.as_deref(), + workload, + &args, + ) + .await?; + } + Ok::<(), anyhow::Error>(()) + }}); + + let abort_handle = workload_runs.abort_handle(); + + tokio::spawn({ + let dashboard_client = dashboard_client.clone(); + async move { + tracing::info!("press Ctrl-C to cancel the invocation"); + match tokio::signal::ctrl_c().await { + Ok(()) => { + tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation"); + mark_as_failed(dashboard_client, invocation_uuid, None).await; + abort_handle.abort(); + } + Err(error) => tracing::warn!( + error = &error as &dyn std::error::Error, + "failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C" + ), + } + } + }); + + match workload_runs.await { + Ok(Ok(_)) => { + tracing::info!("Success"); + Ok::<(), anyhow::Error>(()) + } + Ok(Err(error)) => { + tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard"); + mark_as_failed(dashboard_client, invocation_uuid, Some(error.to_string())).await; + tracing::warn!(%invocation_uuid, "invocation marked as failed following error"); + Err(error) + }, + Err(join_error) => { + match join_error.try_into_panic() { + Ok(panic) => { + tracing::error!("invocation panicked, attempting to report the failure to dashboard"); + mark_as_failed(dashboard_client, invocation_uuid, Some("Panicked".into())).await; + std::panic::resume_unwind(panic) + 
} + Err(_) => { + tracing::warn!("task was canceled"); + Ok(()) + } + } + }, + } + + })?; + + Ok(()) +} + +async fn mark_as_failed( + dashboard_client: Client, + invocation_uuid: Uuid, + failure_reason: Option, +) { + let response = dashboard_client + .post("cancel-invocation") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "failure_reason": failure_reason, + })) + .send() + .await; + let response = match response { + Ok(response) => response, + Err(response_error) => { + tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed"); + return; + } + }; + + if !response.status().is_success() { + tracing::error!( + %invocation_uuid, + "could not mark invocation as failed: {}", + response.text().await.unwrap() + ); + return; + } + tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); +} + +#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner +#[tracing::instrument(skip(assets_client, dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = workload.name))] +async fn run_workload( + assets_client: &Client, + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + invocation_uuid: Uuid, + master_key: Option<&str>, + workload: Workload, + args: &BenchDeriveArgs, +) -> anyhow::Result<()> { + fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; + + let response = dashboard_client + .put("workload") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "name": &workload.name, + "max_runs": workload.run_count, + })) + .send() + .await + .context("could not create new workload")?; + + if !response.status().is_success() { + bail!("creating new workload failed: {}", response.text().await.unwrap()) + } + + let workload_uuid: Uuid = + response.json().await.context("could not deserialize JSON as UUID")?; + + let mut tasks = Vec::new(); + + for i in 0..workload.run_count { + tasks.push( + run_workload_run( + dashboard_client, + logs_client, + meili_client, + workload_uuid, + master_key, + &workload, + args, + i, + ) + .await?, + ); + } + + let mut reports = Vec::with_capacity(workload.run_count as usize); + + for task in tasks { + reports.push( + task.await + .context("task panicked while processing report")? + .context("task failed while processing report")?, + ); + } + + tracing::info!(workload = workload.name, "Successful workload"); + + Ok(()) +} + +#[tracing::instrument(skip(client, assets), fields(asset_count = assets.len()))] +async fn fetch_assets( + client: &Client, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + let mut download_tasks = tokio::task::JoinSet::new(); + for (name, asset) in assets { + // trying local + if let Some(local) = &asset.local_location { + match std::fs::File::open(local) { + Ok(file) => { + if check_sha256(name, asset, file)? 
{ + continue; + } else { + tracing::warn!(asset = name, file = local, "found local resource for asset but hash differed, skipping to asset store"); + } + } + Err(error) => match error.kind() { + std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ + } + _ => tracing::warn!( + error = &error as &dyn std::error::Error, + "error checking local resource, skipping to asset store" + ), + }, + } + } + + // checking asset store + let store_filename = format!("{}/{}", asset_folder, name); + + match std::fs::File::open(&store_filename) { + Ok(file) => { + if check_sha256(name, asset, file)? { + continue; + } else { + tracing::warn!(asset = name, file = store_filename, "found resource for asset in asset store, but hash differed, skipping to remote method"); + } + } + Err(error) => match error.kind() { + std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ + } + _ => tracing::warn!( + error = &error as &dyn std::error::Error, + "error checking resource in store, skipping to remote method" + ), + }, + } + + // downloading remote + match &asset.remote_location { + Some(location) => { + std::fs::create_dir_all(asset_folder).with_context(|| format!("could not create asset folder at {asset_folder}"))?; + download_tasks.spawn({ + let client = client.clone(); + let name = name.to_string(); + let location = location.to_string(); + let store_filename = store_filename.clone(); + let asset = asset.clone(); + download_asset(client, name, asset, location, store_filename)}); + }, + None => bail!("asset {name} has no remote location, but was not found locally or in the asset store"), + } + } + + while let Some(res) = download_tasks.join_next().await { + res.context("download task panicked")?.context("download task failed")?; + } + + Ok(()) +} + +fn check_sha256(name: &str, asset: &Asset, mut file: std::fs::File) -> anyhow::Result { + let mut bytes = Vec::new(); + file.read_to_end(&mut bytes).with_context(|| format!("hashing file for asset {name}"))?; + let mut file_hash = sha2::Sha256::new(); + file_hash.update(&bytes); + let file_hash = file_hash.finalize(); + let file_hash = format!("{:x}", file_hash); + tracing::debug!(hash = file_hash, "hashed local file"); + + Ok(match &asset.sha256 { + Some(hash) => { + tracing::debug!(hash, "hash from workload"); + if hash.to_ascii_lowercase() == file_hash { + true + } else { + tracing::warn!( + file_hash, + asset_hash = hash.to_ascii_lowercase(), + "hashes don't match" + ); + false + } + } + None => { + tracing::warn!(sha256 = file_hash, "Skipping hash for asset {name} that doesn't have one. 
Please add it to workload file"); + true + } + }) +} + +#[tracing::instrument(skip(client, asset, name), fields(asset = name))] +async fn download_asset( + client: Client, + name: String, + asset: Asset, + src: String, + dest_filename: String, +) -> anyhow::Result<()> { + let context = || format!("failure downloading asset {name} from {src}"); + + let response = client.get(&src).send().await.with_context(context)?; + + let file = std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&dest_filename) + .with_context(|| format!("creating destination file {dest_filename}")) + .with_context(context)?; + + let mut dest = std::io::BufWriter::new( + file.try_clone().context("cloning I/O handle").with_context(context)?, + ); + + let total_len: Option = response + .headers() + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse().ok()); + + let progress = tokio::spawn({ + let name = name.clone(); + async move { + loop { + match file.metadata().context("could not get file metadata") { + Ok(metadata) => { + let len = metadata.len(); + tracing::info!( + asset = name, + downloaded_bytes = len, + total_bytes = total_len, + "asset download in progress" + ); + } + Err(error) => { + tracing::warn!(%error, "could not get file metadata"); + } + } + tokio::time::sleep(std::time::Duration::from_secs(60)).await; + } + } + }); + + let writing_context = || format!("while writing to destination file at {dest_filename}"); + + let mut response = response.bytes_stream(); + + while let Some(bytes) = + response.try_next().await.context("while downloading file").with_context(context)? + { + dest.write_all(&bytes).with_context(writing_context).with_context(context)?; + } + + progress.abort(); + + let mut file = dest.into_inner().with_context(writing_context).with_context(context)?; + + file.rewind().context("while rewinding asset file")?; + + if !check_sha256(&name, &asset, file)? 
{ + bail!("asset '{name}': sha256 mismatch for file {dest_filename} downloaded from {src}") + } + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner +#[tracing::instrument(skip(dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = %workload.name))] +async fn run_workload_run( + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + workload_uuid: Uuid, + master_key: Option<&str>, + workload: &Workload, + args: &BenchDeriveArgs, + run_number: u16, +) -> anyhow::Result>> { + delete_db(); + build_meilisearch().await?; + let meilisearch = + start_meilisearch(meili_client, master_key, workload, &args.asset_folder).await?; + let processor = run_commands( + dashboard_client, + logs_client, + meili_client, + workload_uuid, + workload, + args, + run_number, + ) + .await?; + + kill_meilisearch(meilisearch).await; + + tracing::info!(run_number, "Successful run"); + + Ok(processor) +} + +async fn kill_meilisearch(mut meilisearch: tokio::process::Child) { + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } +} + +#[tracing::instrument] +async fn build_meilisearch() -> anyhow::Result<()> { + let mut command = tokio::process::Command::new("cargo"); + command.arg("build").arg("--release").arg("-p").arg("meilisearch"); + + command.kill_on_drop(true); + + let mut builder = command.spawn().context("error building Meilisearch")?; + + if !builder.wait().await.context("could not build Meilisearch")?.success() { + bail!("failed building Meilisearch") + } + + Ok(()) +} + +#[tracing::instrument(skip(client, master_key, workload), fields(workload = workload.name))] +async fn start_meilisearch( + client: &Client, + master_key: Option<&str>, + workload: &Workload, + asset_folder: &str, +) -> anyhow::Result { + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + + command.arg("--db-path").arg("./_xtask_benchmark.ms"); + if let Some(master_key) = master_key { + command.arg("--master-key").arg(master_key); + } + command.arg("--experimental-enable-logs-route"); + + for extra_arg in workload.extra_cli_args.iter() { + command.arg(extra_arg); + } + + command.kill_on_drop(true); + + let mut meilisearch = command.spawn().context("Error starting Meilisearch")?; + + wait_for_health(client, &mut meilisearch, &workload.assets, asset_folder).await?; + + Ok(meilisearch) +} + +async fn wait_for_health( + client: &Client, + meilisearch: &mut tokio::process::Child, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + for i in 0..100 { + let res = run_command(client.clone(), health_command(), assets, asset_folder).await; + if res.is_ok() { + // check that this is actually the current Meilisearch instance that answered us + if let Some(exit_code) = + meilisearch.try_wait().context("cannot check Meilisearch server process status")? + { + tracing::error!("Got an health response from a different process"); + bail!("Meilisearch server exited early with code {exit_code}"); + } + + return Ok(()); + } + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + // check whether the Meilisearch instance exited early (cut the wait) + if let Some(exit_code) = + meilisearch.try_wait().context("cannot check Meilisearch server process status")? 
+ { + bail!("Meilisearch server exited early with code {exit_code}"); + } + tracing::debug!(attempt = i, "Waiting for Meilisearch to go up"); + } + bail!("meilisearch is not responding") +} + +fn health_command() -> Command { + Command { + route: "/health".into(), + method: Method::GET, + body: Default::default(), + synchronous: SyncMode::WaitForResponse, + } +} + +fn delete_db() { + let _ = std::fs::remove_dir_all("./_xtask_benchmark.ms"); +} + +async fn run_commands( + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + workload_uuid: Uuid, + workload: &Workload, + args: &BenchDeriveArgs, + run_number: u16, +) -> anyhow::Result>> { + let report_folder = &args.report_folder; + let workload_name = &workload.name; + + std::fs::create_dir_all(report_folder) + .with_context(|| format!("could not create report directory at {report_folder}"))?; + + let trace_filename = format!("{report_folder}/{workload_name}-{run_number}-trace.json"); + let report_filename = format!("{report_folder}/{workload_name}-{run_number}-report.json"); + + let report_handle = start_report(logs_client, trace_filename).await?; + + for batch in workload + .commands + .as_slice() + .split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait)) + { + run_batch(meili_client, batch, &workload.assets, &args.asset_folder).await?; + } + + let processor = + stop_report(dashboard_client, logs_client, workload_uuid, report_filename, report_handle) + .await?; + + Ok(processor) +} + +async fn stop_report( + dashboard_client: &Client, + logs_client: &Client, + workload_uuid: Uuid, + filename: String, + report_handle: tokio::task::JoinHandle>, +) -> anyhow::Result>> { + let response = logs_client.delete("").send().await.context("while stopping report")?; + if !response.status().is_success() { + bail!("received HTTP {} while stopping report", response.status()) + } + + let mut file = tokio::time::timeout(std::time::Duration::from_secs(1000), report_handle) + .await + .context("while waiting for the end of the report")? + .context("report writing task panicked")? 
+ .context("while writing report")?; + + file.rewind().context("while rewinding report file")?; + + let process_handle = tokio::task::spawn({ + let dashboard_client = dashboard_client.clone(); + async move { + let span = tracing::info_span!("processing trace to report", filename); + let _guard = span.enter(); + let report = tracing_trace::processor::span_stats::to_call_stats( + tracing_trace::TraceReader::new(std::io::BufReader::new(file)), + ) + .context("could not convert trace to report")?; + let context = || format!("writing report to {filename}"); + + let response = dashboard_client + .put("run") + .json(&json!({ + "workload_uuid": workload_uuid, + "data": report + })) + .send() + .await + .context("sending new run")?; + + if !response.status().is_success() { + bail!( + "sending new run failed: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ) + } + + let mut output_file = std::io::BufWriter::new( + std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&filename) + .with_context(context)?, + ); + + for (key, value) in report { + serde_json::to_writer(&mut output_file, &json!({key: value})) + .context("serializing span stat")?; + writeln!(&mut output_file).with_context(context)?; + } + output_file.flush().with_context(context)?; + let mut output_file = output_file.into_inner().with_context(context)?; + + output_file.rewind().context("could not rewind output_file").with_context(context)?; + + tracing::info!("success"); + Ok(output_file) + } + }); + + Ok(process_handle) +} + +async fn start_report( + logs_client: &Client, + filename: String, +) -> anyhow::Result>> { + let report_file = std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&filename) + .with_context(|| format!("could not create file at {filename}"))?; + let mut report_file = std::io::BufWriter::new(report_file); + + let response = logs_client + .post("") + .json(&json!({ + "mode": "profile", + "target": "indexing::=trace" + })) + .send() + .await + .context("failed to start report")?; + + let code = response.status(); + if code.is_client_error() { + tracing::error!(%code, "request error when trying to start report"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("response error when trying to start report")?; + bail!( + "request error when trying to start report: server responded with error code {code} and '{response}'" + ) + } else if code.is_server_error() { + tracing::error!(%code, "server error when trying to start report"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("response error trying to start report")?; + bail!("server error when trying to start report: server responded with error code {code} and '{response}'") + } + + Ok(tokio::task::spawn(async move { + let mut stream = response.bytes_stream(); + while let Some(bytes) = stream.try_next().await.context("while waiting for report")? 
{ + report_file + .write_all(&bytes) + .with_context(|| format!("while writing report to {filename}"))?; + } + report_file.into_inner().with_context(|| format!("while writing report to {filename}")) + })) +} + +async fn run_batch( + client: &Client, + batch: &[Command], + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + let [.., last] = batch else { return Ok(()) }; + let sync = last.synchronous; + + let mut tasks = tokio::task::JoinSet::new(); + + for command in batch { + // FIXME: you probably don't want to copy assets everytime here + tasks.spawn({ + let client = client.clone(); + let command = command.clone(); + let assets = assets.clone(); + let asset_folder = asset_folder.to_owned(); + + async move { run_command(client, command, &assets, &asset_folder).await } + }); + } + + while let Some(result) = tasks.join_next().await { + result + .context("panicked while executing command")? + .context("error while executing command")?; + } + + match sync { + SyncMode::DontWait => {} + SyncMode::WaitForResponse => {} + SyncMode::WaitForTask => wait_for_tasks(client).await?, + } + + Ok(()) +} + +async fn wait_for_tasks(client: &Client) -> anyhow::Result<()> { + loop { + let response = client + .get("tasks?statuses=enqueued,processing") + .send() + .await + .context("could not wait for tasks")?; + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response to JSON") + .context("could not wait for tasks")?; + match response.get("total") { + Some(serde_json::Value::Number(number)) => { + let number = number.as_u64().with_context(|| { + format!("waiting for tasks: could not parse 'total' as integer, got {}", number) + })?; + if number == 0 { + break; + } else { + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + continue; + } + } + Some(thing_else) => { + bail!(format!( + "waiting for tasks: could not parse 'total' as a number, got '{thing_else}'" + )) + } + None => { + bail!(format!( + "waiting for tasks: expected response to contain 'total', got '{response}'" + )) + } + } + } + Ok(()) +} + +#[tracing::instrument(skip(client, command, assets, asset_folder), fields(command = %command))] +async fn run_command( + client: Client, + mut command: Command, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + // memtake the body here to leave an empty body in its place, so that command is not partially moved-out + let body = std::mem::take(&mut command.body) + .get(assets, asset_folder) + .with_context(|| format!("while getting body for command {command}"))?; + + let request = client.request(command.method.into(), &command.route); + + let request = if let Some((body, content_type)) = body { + request.body(body).header(reqwest::header::CONTENT_TYPE, content_type) + } else { + request + }; + + let response = + request.send().await.with_context(|| format!("error sending command: {}", command))?; + + let code = response.status(); + if code.is_client_error() { + tracing::error!(%command, %code, "error in workload file"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("parsing error in workload file when sending command")?; + bail!("error in workload file: server responded with error code {code} and '{response}'") + } else if code.is_server_error() { + tracing::error!(%command, %code, "server error"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("parsing 
server error when sending command")?; + bail!("server error: server responded with error code {code} and '{response}'") + } + + Ok(()) +} diff --git a/xtask/src/lib.rs b/xtask/src/lib.rs new file mode 100644 index 000000000..cbda260db --- /dev/null +++ b/xtask/src/lib.rs @@ -0,0 +1 @@ +pub mod bench; diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 6570dc67b..b81424666 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use clap::Parser; +use xtask::bench::BenchDeriveArgs; /// List features available in the workspace #[derive(Parser, Debug)] @@ -17,13 +18,16 @@ struct ListFeaturesDeriveArgs { #[command(bin_name = "cargo xtask")] enum Command { ListFeatures(ListFeaturesDeriveArgs), + Bench(BenchDeriveArgs), } -fn main() { +fn main() -> anyhow::Result<()> { let args = Command::parse(); match args { Command::ListFeatures(args) => list_features(args), + Command::Bench(args) => xtask::bench::run(args)?, } + Ok(()) } fn list_features(args: ListFeaturesDeriveArgs) { From c608b3f9b5fa037254f7bbbaa1a2e3298087f664 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 27 Feb 2024 18:34:52 +0100 Subject: [PATCH 41/52] Factor vergen stuff to a build-info crate --- Cargo.lock | 149 +++++++++---- Cargo.toml | 2 +- Dockerfile | 2 +- build-info/Cargo.toml | 18 ++ build-info/build.rs | 22 ++ build-info/src/lib.rs | 203 ++++++++++++++++++ meilisearch/Cargo.toml | 2 +- meilisearch/build.rs | 13 -- .../src/analytics/segment_analytics.rs | 4 +- meilisearch/src/lib.rs | 27 --- meilisearch/src/main.rs | 22 +- meilisearch/src/routes/mod.rs | 14 +- xtask/Cargo.toml | 11 +- xtask/src/bench/env_info.rs | 54 ----- xtask/src/bench/mod.rs | 12 +- 15 files changed, 396 insertions(+), 159 deletions(-) create mode 100644 build-info/Cargo.toml create mode 100644 build-info/build.rs create mode 100644 build-info/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index eca3b2fbc..700bb2653 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -356,9 +356,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.79" +version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" dependencies = [ "backtrace", ] @@ -628,6 +628,15 @@ dependencies = [ "serde", ] +[[package]] +name = "build-info" +version = "1.7.0" +dependencies = [ + "anyhow", + "time", + "vergen-git2", +] + [[package]] name = "bumpalo" version = "3.13.0" @@ -1348,7 +1357,16 @@ version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" dependencies = [ - "derive_builder_macro", + "derive_builder_macro 0.12.0", +] + +[[package]] +name = "derive_builder" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f59169f400d8087f238c5c0c7db6a28af18681717f3b623227d92f397e938c7" +dependencies = [ + "derive_builder_macro 0.13.1", ] [[package]] @@ -1363,13 +1381,35 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_core" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4ec317cc3e7ef0928b0ca6e4a634a4d6c001672ae210438cf114a83e56b018d" +dependencies = [ + "darling 0.14.4", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "derive_builder_macro" version = "0.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" dependencies = [ - "derive_builder_core", + "derive_builder_core 0.12.0", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "870368c3fb35b8031abb378861d4460f573b92238ec2152c927a21f77e3e0127" +dependencies = [ + "derive_builder_core 0.13.1", "syn 1.0.109", ] @@ -2088,11 +2128,11 @@ checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" [[package]] name = "git2" -version = "0.16.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf7f68c2995f392c49fffb4f95ae2c873297830eb25c6bc4c114ce8f4562acc" +checksum = "1b3ba52851e73b46a4c3df1d89343741112003f0f6f13beb0dfac9e457c3fdcd" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "libc", "libgit2-sys", "log", @@ -2389,7 +2429,7 @@ dependencies = [ "bincode", "crossbeam", "csv", - "derive_builder", + "derive_builder 0.12.0", "dump", "enum-iterator", "file-store", @@ -2506,7 +2546,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", - "rustix 0.38.26", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -2628,15 +2668,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libgit2-sys" -version = "0.14.2+1.5.1" +version = "0.16.2+1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" +checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" dependencies = [ "cc", "libc", @@ -2683,9 +2723,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.12" +version = "1.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +checksum = "037731f5d3aaa87a5675e895b63ddff1a87624bc29f77004ea829809654e48f6" dependencies = [ "cc", "libc", @@ -3122,6 +3162,7 @@ dependencies = [ "async-trait", "brotli", "bstr", + "build-info", "byte-unit", "bytes", "cargo_toml", @@ -3193,7 +3234,6 @@ dependencies = [ "url", "urlencoding", "uuid", - "vergen", "walkdir", "yaup", "zip", @@ -3530,6 +3570,15 @@ dependencies = [ "libc", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "number_prefix" version = "0.4.0" @@ -4144,15 +4193,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_users" version = "0.4.3" @@ -4343,9 +4383,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.26" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9470c4bf8246c8daf25f9598dca807fb6510347b1e1cfa55749113850c79d88a" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ "bitflags 2.4.1", "errno", @@ -4881,14 +4921,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", - "rustix 0.38.26", + "rustix 0.38.31", "windows-sys 0.52.0", ] @@ -4948,13 +4987,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.32" +version = "0.3.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe80ced77cbfb4cb91a94bf72b378b4b6791a0d9b7f09d0be747d1bdff4e68bd" +checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" dependencies = [ "deranged", "itoa", + "libc", "num-conv", + "num_threads", "powerfmt", "serde", "time-core", @@ -5008,7 +5049,7 @@ version = "0.14.1" source = "git+https://github.com/huggingface/tokenizers.git?tag=v0.14.1#6357206cdcce4d78ffb1e0372feb456caea09375" dependencies = [ "aho-corasick", - "derive_builder", + "derive_builder 0.12.0", "esaxx-rs", "getrandom", "itertools 0.11.0", @@ -5434,18 +5475,42 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vergen" -version = "7.5.1" +version = "9.0.0-beta.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f21b881cd6636ece9735721cf03c1fe1e774fe258683d084bb2812ab67435749" +checksum = "107dc53b443fe8cc380798abb75ad6b7038281165109afea1f1b28bb47047ed5" dependencies = [ "anyhow", - "cfg-if", - "enum-iterator", + "derive_builder 0.13.1", "getset", + "rustversion", + "vergen-lib", +] + +[[package]] +name = "vergen-git2" +version = "1.0.0-beta.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8875c5d71074bb67118774e3d795ab6fe77c3ae3161cb54e19104cabc49487f1" +dependencies = [ + "anyhow", + "derive_builder 0.13.1", "git2", "rustversion", - "thiserror", "time", + "vergen", + "vergen-lib", +] + +[[package]] +name = "vergen-lib" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26ebfba72ba904559f25f41ea1512335b5a46459084258cea0857549d9645187" +dependencies = [ + "anyhow", + "derive_builder 0.13.1", + "getset", + "rustversion", ] [[package]] @@ -5873,9 +5938,9 @@ checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" [[package]] name = "winnow" -version = "0.5.4" +version = "0.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acaaa1190073b2b101e15083c38ee8ec891b5e05cbee516521e94ec008f61e64" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" dependencies = [ "memchr", ] @@ -5904,11 +5969,11 @@ name = "xtask" version = "1.7.0" dependencies = [ "anyhow", + "build-info", "cargo_metadata", "clap", "futures-core", "futures-util", - "git2", "reqwest", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 11190025a..1d79fd196 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ members = [ "benchmarks", "fuzzers", "tracing-trace", - "xtask", + "xtask", "build-info", ] [workspace.package] diff --git a/Dockerfile b/Dockerfile index dd2cfc134..5b227e6fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ WORKDIR / ARG COMMIT_SHA ARG 
COMMIT_DATE ARG GIT_TAG -ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG} +ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_DESCRIBE=${GIT_TAG} ENV RUSTFLAGS="-C target-feature=-crt-static" COPY . . diff --git a/build-info/Cargo.toml b/build-info/Cargo.toml new file mode 100644 index 000000000..50854a642 --- /dev/null +++ b/build-info/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "build-info" +version.workspace = true +authors.workspace = true +description.workspace = true +homepage.workspace = true +readme.workspace = true +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +time = { version = "0.3.34", features = ["parsing"] } + +[build-dependencies] +anyhow = "1.0.80" +vergen-git2 = "1.0.0-beta.2" diff --git a/build-info/build.rs b/build-info/build.rs new file mode 100644 index 000000000..b1ec0ab47 --- /dev/null +++ b/build-info/build.rs @@ -0,0 +1,22 @@ +fn main() { + if let Err(err) = emit_git_variables() { + println!("cargo:warning=vergen: {}", err); + } +} + +fn emit_git_variables() -> anyhow::Result<()> { + // Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them + // in the corresponding GitHub workflow (publish_docker.yml). + // This is due to the Dockerfile building the binary outside of the git directory. + let mut builder = vergen_git2::Git2Builder::default(); + + builder.branch(true); + builder.commit_timestamp(true); + builder.commit_message(true); + builder.describe(true, true, None); + builder.sha(false); + + let git2 = builder.build()?; + + vergen_git2::Emitter::default().fail_on_error().add_instructions(&git2)?.emit() +} diff --git a/build-info/src/lib.rs b/build-info/src/lib.rs new file mode 100644 index 000000000..cfcefb4a2 --- /dev/null +++ b/build-info/src/lib.rs @@ -0,0 +1,203 @@ +use time::format_description::well_known::Iso8601; + +#[derive(Debug, Clone)] +pub struct BuildInfo { + pub branch: Option<&'static str>, + pub describe: Option, + pub commit_sha1: Option<&'static str>, + pub commit_msg: Option<&'static str>, + pub commit_timestamp: Option, +} + +impl BuildInfo { + pub fn from_build() -> Self { + let branch: Option<&'static str> = option_env!("VERGEN_GIT_BRANCH"); + let describe = DescribeResult::from_build(); + let commit_sha1 = option_env!("VERGEN_GIT_SHA"); + let commit_msg = option_env!("VERGEN_GIT_COMMIT_MESSAGE"); + let commit_timestamp = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP"); + + let commit_timestamp = commit_timestamp.and_then(|commit_timestamp| { + time::OffsetDateTime::parse(commit_timestamp, &Iso8601::DEFAULT).ok() + }); + + Self { branch, describe, commit_sha1, commit_msg, commit_timestamp } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum DescribeResult { + Prototype { name: &'static str }, + Release { version: &'static str, major: u64, minor: u64, patch: u64 }, + Prerelease { version: &'static str, major: u64, minor: u64, patch: u64, rc: u64 }, + NotATag { describe: &'static str }, +} + +impl DescribeResult { + pub fn new(describe: &'static str) -> Self { + if let Some(name) = prototype_name(describe) { + Self::Prototype { name } + } else if let Some(release) = release_version(describe) { + release + } else if let Some(prerelease) = prerelease_version(describe) { + prerelease + } else { + Self::NotATag { describe } + } + 
} + + pub fn from_build() -> Option { + let describe: &'static str = option_env!("VERGEN_GIT_DESCRIBE")?; + Some(Self::new(describe)) + } + + pub fn as_tag(&self) -> Option<&'static str> { + match self { + DescribeResult::Prototype { name } => Some(name), + DescribeResult::Release { version, .. } => Some(version), + DescribeResult::Prerelease { version, .. } => Some(version), + DescribeResult::NotATag { describe: _ } => None, + } + } + + pub fn as_prototype(&self) -> Option<&'static str> { + match self { + DescribeResult::Prototype { name } => Some(name), + DescribeResult::Release { .. } + | DescribeResult::Prerelease { .. } + | DescribeResult::NotATag { .. } => None, + } + } +} + +/// Parses the input as a prototype name. +/// +/// Returns `Some(prototype_name)` if the following conditions are met on this value: +/// +/// 1. starts with `prototype-`, +/// 2. ends with `-`, +/// 3. does not end with `-`. +/// +/// Otherwise, returns `None`. +fn prototype_name(describe: &'static str) -> Option<&'static str> { + if !describe.starts_with("prototype-") { + return None; + } + + let mut rsplit_prototype = describe.rsplit('-'); + // last component MUST be a number + rsplit_prototype.next()?.parse::().ok()?; + // before than last component SHALL NOT be a number + rsplit_prototype.next()?.parse::().err()?; + + Some(describe) +} + +fn release_version(describe: &'static str) -> Option { + if !describe.starts_with('v') { + return None; + } + + // full release version don't contain a `-` + if describe.contains('-') { + return None; + } + + // full release version parse as vX.Y.Z, with X, Y, Z numbers. + let mut dots = describe[1..].split('.'); + let major: u64 = dots.next()?.parse().ok()?; + let minor: u64 = dots.next()?.parse().ok()?; + let patch: u64 = dots.next()?.parse().ok()?; + + if dots.next().is_some() { + return None; + } + + Some(DescribeResult::Release { version: describe, major, minor, patch }) +} + +fn prerelease_version(describe: &'static str) -> Option { + // prerelease version is in the shape vM.N.P-rc.C + let mut hyphen = describe.rsplit('-'); + let prerelease = hyphen.next()?; + if !prerelease.starts_with("rc.") { + return None; + } + + let rc: u64 = prerelease[3..].parse().ok()?; + + let release = hyphen.next()?; + + let DescribeResult::Release { version: _, major, minor, patch } = release_version(release)? 
+ else { + return None; + }; + + Some(DescribeResult::Prerelease { version: describe, major, minor, patch, rc }) +} + +#[cfg(test)] +mod test { + use super::DescribeResult; + + fn assert_not_a_tag(describe: &'static str) { + assert_eq!(DescribeResult::NotATag { describe }, DescribeResult::new(describe)) + } + + fn assert_proto(describe: &'static str) { + assert_eq!(DescribeResult::Prototype { name: describe }, DescribeResult::new(describe)) + } + + fn assert_release(describe: &'static str, major: u64, minor: u64, patch: u64) { + assert_eq!( + DescribeResult::Release { version: describe, major, minor, patch }, + DescribeResult::new(describe) + ) + } + + fn assert_prerelease(describe: &'static str, major: u64, minor: u64, patch: u64, rc: u64) { + assert_eq!( + DescribeResult::Prerelease { version: describe, major, minor, patch, rc }, + DescribeResult::new(describe) + ) + } + + #[test] + fn not_a_tag() { + assert_not_a_tag("whatever-fuzzy"); + assert_not_a_tag("whatever-fuzzy-5-ggg-dirty"); + assert_not_a_tag("whatever-fuzzy-120-ggg-dirty"); + + // technically a tag, but not a proto nor a version, so not parsed as a tag + assert_not_a_tag("whatever"); + + // dirty version + assert_not_a_tag("v1.7.0-1-ggga-dirty"); + assert_not_a_tag("v1.7.0-rc.1-1-ggga-dirty"); + + // after version + assert_not_a_tag("v1.7.0-1-ggga"); + assert_not_a_tag("v1.7.0-rc.1-1-ggga"); + + // after proto + assert_not_a_tag("protoype-tag-0-1-ggga"); + assert_not_a_tag("protoype-tag-0-1-ggga-dirty"); + } + + #[test] + fn prototype() { + assert_proto("prototype-tag-0"); + assert_proto("prototype-tag-10"); + assert_proto("prototype-long-name-tag-10"); + } + + #[test] + fn release() { + assert_release("v1.7.2", 1, 7, 2); + } + + #[test] + fn prerelease() { + assert_prerelease("v1.7.2-rc.3", 1, 7, 2, 3); + } +} diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index fc4f5aa8b..b65c466ca 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -107,6 +107,7 @@ tracing = "0.1.40" tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.9" +build-info = { version = "1.7.0", path = "../build-info" } [dev-dependencies] actix-rt = "2.9.0" @@ -131,7 +132,6 @@ reqwest = { version = "0.11.23", features = [ sha-1 = { version = "0.10.1", optional = true } static-files = { version = "0.2.3", optional = true } tempfile = { version = "3.9.0", optional = true } -vergen = { version = "7.5.1", default-features = false, features = ["git"] } zip = { version = "0.6.6", optional = true } [features] diff --git a/meilisearch/build.rs b/meilisearch/build.rs index c839b6e33..dc24b0449 100644 --- a/meilisearch/build.rs +++ b/meilisearch/build.rs @@ -1,17 +1,4 @@ -use vergen::{vergen, Config, SemverKind}; - fn main() { - // Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them - // in the corresponding GitHub workflow (publish_docker.yml). - // This is due to the Dockerfile building the binary outside of the git directory. 
- let mut config = Config::default(); - // allow using non-annotated tags - *config.git_mut().semver_kind_mut() = SemverKind::Lightweight; - - if let Err(e) = vergen(config) { - println!("cargo:warning=vergen: {}", e); - } - #[cfg(feature = "mini-dashboard")] mini_dashboard::setup_mini_dashboard().expect("Could not load the mini-dashboard assets"); } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 55ddb4747..7dfc52900 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -473,7 +473,9 @@ impl Segment { create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default()) { // Replace the version number with the prototype name if any. - let version = if let Some(prototype) = crate::prototype_name() { + let version = if let Some(prototype) = build_info::DescribeResult::from_build() + .and_then(|describe| describe.as_prototype()) + { prototype } else { env!("CARGO_PKG_VERSION") diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index 9d9274b9d..820f1ae42 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -536,30 +536,3 @@ pub fn dashboard(config: &mut web::ServiceConfig, enable_frontend: bool) { pub fn dashboard(config: &mut web::ServiceConfig, _enable_frontend: bool) { config.service(web::resource("/").route(web::get().to(routes::running))); } - -/// Parses the output of -/// [`VERGEN_GIT_SEMVER_LIGHTWEIGHT`](https://docs.rs/vergen/latest/vergen/struct.Git.html#instructions) -/// as a prototype name. -/// -/// Returns `Some(prototype_name)` if the following conditions are met on this value: -/// -/// 1. starts with `prototype-`, -/// 2. ends with `-`, -/// 3. does not end with `-`. -/// -/// Otherwise, returns `None`. 
-pub fn prototype_name() -> Option<&'static str> { - let prototype: &'static str = option_env!("VERGEN_GIT_SEMVER_LIGHTWEIGHT")?; - - if !prototype.starts_with("prototype-") { - return None; - } - - let mut rsplit_prototype = prototype.rsplit('-'); - // last component MUST be a number - rsplit_prototype.next()?.parse::().ok()?; - // before than last component SHALL NOT be a number - rsplit_prototype.next()?.parse::().err()?; - - Some(prototype) -} diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index f1f93dd99..79ca7ec80 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -12,8 +12,8 @@ use is_terminal::IsTerminal; use meilisearch::analytics::Analytics; use meilisearch::option::LogMode; use meilisearch::{ - analytics, create_app, prototype_name, setup_meilisearch, LogRouteHandle, LogRouteType, - LogStderrHandle, LogStderrType, Opt, SubscriberForSecondLayer, + analytics, create_app, setup_meilisearch, LogRouteHandle, LogRouteType, LogStderrHandle, + LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; use mimalloc::MiMalloc; @@ -163,8 +163,8 @@ pub fn print_launch_resume( analytics: Arc, config_read_from: Option, ) { - let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"); - let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown"); + let build_info = build_info::BuildInfo::from_build(); + let protocol = if opt.ssl_cert_path.is_some() && opt.ssl_key_path.is_some() { "https" } else { "http" }; let ascii_name = r#" @@ -189,10 +189,18 @@ pub fn print_launch_resume( eprintln!("Database path:\t\t{:?}", opt.db_path); eprintln!("Server listening on:\t\"{}://{}\"", protocol, opt.http_addr); eprintln!("Environment:\t\t{:?}", opt.env); - eprintln!("Commit SHA:\t\t{:?}", commit_sha.to_string()); - eprintln!("Commit date:\t\t{:?}", commit_date.to_string()); + eprintln!("Commit SHA:\t\t{:?}", build_info.commit_sha1.unwrap_or("unknown")); + eprintln!( + "Commit date:\t\t{:?}", + build_info + .commit_timestamp + .and_then(|commit_timestamp| commit_timestamp + .format(&time::format_description::well_known::Iso8601::DEFAULT) + .ok()) + .unwrap_or("unknown".into()) + ); eprintln!("Package version:\t{:?}", env!("CARGO_PKG_VERSION").to_string()); - if let Some(prototype) = prototype_name() { + if let Some(prototype) = build_info.describe.and_then(|describe| describe.as_prototype()) { eprintln!("Prototype:\t\t{:?}", prototype); } diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index 249103e12..1c1465582 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -359,12 +359,18 @@ async fn get_version( ) -> HttpResponse { analytics.publish("Version Seen".to_string(), json!(null), Some(&req)); - let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"); - let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown"); + let build_info = build_info::BuildInfo::from_build(); HttpResponse::Ok().json(VersionResponse { - commit_sha: commit_sha.to_string(), - commit_date: commit_date.to_string(), + commit_sha: build_info.commit_sha1.unwrap_or("unknown").to_string(), + commit_date: build_info + .commit_timestamp + .and_then(|commit_timestamp| { + commit_timestamp + .format(&time::format_description::well_known::Iso8601::DEFAULT) + .ok() + }) + .unwrap_or("unknown".into()), pkg_version: env!("CARGO_PKG_VERSION").to_string(), }) } diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 
a59a79e53..0df8161ce 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -12,11 +12,11 @@ license.workspace = true [dependencies] anyhow = "1.0.79" +build-info = { version = "1.7.0", path = "../build-info" } cargo_metadata = "0.18.1" clap = { version = "4.4.14", features = ["derive"] } futures-core = "0.3.30" futures-util = "0.3.30" -git2 = { version = "0.16", default_features = false } reqwest = { version = "0.11.23", features = [ "stream", "json", @@ -26,7 +26,11 @@ serde = { version = "1.0.195", features = ["derive"] } serde_json = "1.0.111" sha2 = "0.10.8" sysinfo = "0.30.5" -time = { version = "0.3.32", features = ["serde", "serde-human-readable"] } +time = { version = "0.3.32", features = [ + "serde", + "serde-human-readable", + "macros", +] } tokio = { version = "1.35.1", features = [ "rt", "net", @@ -38,3 +42,6 @@ tracing = "0.1.40" tracing-subscriber = "0.3.18" tracing-trace = { version = "0.1.0", path = "../tracing-trace" } uuid = { version = "1.7.0", features = ["v7", "serde"] } + +[build-dependencies] +anyhow = "1.0.79" diff --git a/xtask/src/bench/env_info.rs b/xtask/src/bench/env_info.rs index 5cbeb4274..08dacf915 100644 --- a/xtask/src/bench/env_info.rs +++ b/xtask/src/bench/env_info.rs @@ -1,58 +1,4 @@ use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; - -#[derive(Debug, Clone, Deserialize, Serialize)] -#[serde(rename_all = "camelCase")] -pub struct Source { - pub repo_url: Option, - pub branch_or_tag: String, - pub commit_id: String, - pub commit_msg: String, - pub author_name: String, - pub author_email: String, - pub committer_name: String, - pub committer_email: String, -} - -impl Source { - pub fn from_repo( - path: impl AsRef, - ) -> Result<(Self, OffsetDateTime), git2::Error> { - use git2::Repository; - - let repo = Repository::open(path)?; - let remote = repo.remotes()?; - let remote = remote.get(0).expect("No remote associated to the repo"); - let remote = repo.find_remote(remote)?; - - let head = repo.head()?; - - let commit = head.peel_to_commit()?; - - let time = OffsetDateTime::from_unix_timestamp(commit.time().seconds()).unwrap(); - - let author = commit.author(); - let committer = commit.committer(); - - Ok(( - Self { - repo_url: remote.url().map(|s| s.to_string()), - branch_or_tag: head.name().unwrap().to_string(), - commit_id: commit.id().to_string(), - commit_msg: String::from_utf8_lossy(commit.message_bytes()) - .to_string() - .lines() - .next() - .map_or(String::new(), |s| s.to_string()), - author_name: author.name().unwrap().to_string(), - author_email: author.email().unwrap().to_string(), - committer_name: committer.name().unwrap().to_string(), - committer_email: committer.email().unwrap().to_string(), - }, - time, - )) - } -} #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs index ea17b6f69..cfc7c124f 100644 --- a/xtask/src/bench/mod.rs +++ b/xtask/src/bench/mod.rs @@ -292,8 +292,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { args.log_filter.parse().context("invalid --log-filter")?; let env = env_info::Environment::generate_from_current_config(); - let (source, commit_date) = - env_info::Source::from_repo(".").context("could not get repository information")?; + let build_info = build_info::BuildInfo::from_build(); let subscriber = tracing_subscriber::registry().with( tracing_subscriber::fmt::layer() @@ -344,17 +343,18 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { ); } - let commit_message = 
source.commit_msg.split('\n').next().unwrap(); + let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); let response = dashboard_client .put("invocation") .json(&json!({ "commit": { - "sha1": source.commit_id, + "sha1": build_info.commit_sha1, "message": commit_message, - "commit_date": commit_date, - "branch": source.branch_or_tag + "commit_date": build_info.commit_timestamp, + "branch": build_info.branch, + "tag": build_info.describe.and_then(|describe| describe.as_tag()), }, "machine_hostname": env.hostname, "max_workloads": max_workloads, From 55f60a363808d39dd6ee16f9b589df48d0d653f7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 4 Mar 2024 14:29:44 +0100 Subject: [PATCH 42/52] Update .gitignore - Ignore `/bench` directory for git purposes - Ignore benchmark DB --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5f660c735..e00f45c1e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ /data.ms /snapshots /dumps +/bench +/_xtask_benchmark.ms # Snapshots ## ... large From eee46b7537f4a1fad754b9b9106351e765171b0c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 4 Mar 2024 14:31:14 +0100 Subject: [PATCH 43/52] Add first workloads --- workloads/hackernews.json | 164 ++++++++++++++++++++++++++++++++ workloads/movies-nothreads.json | 44 +++++++++ workloads/movies.json | 42 ++++++++ 3 files changed, 250 insertions(+) create mode 100644 workloads/hackernews.json create mode 100644 workloads/movies-nothreads.json create mode 100644 workloads/movies.json diff --git a/workloads/hackernews.json b/workloads/hackernews.json new file mode 100644 index 000000000..0a99b69ff --- /dev/null +++ b/workloads/hackernews.json @@ -0,0 +1,164 @@ +{ + "name": "hackernews.ndjson_1M", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-100_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-100_000.ndjson", + "sha256": "60ecd23485d560edbd90d9ca31f0e6dba1455422f2a44e402600fbb5f7f1b213" + }, + "hackernews-200_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-200_000.ndjson", + "sha256": "785b0271fdb47cba574fab617d5d332276b835c05dd86e4a95251cf7892a1685" + }, + "hackernews-300_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-300_000.ndjson", + "sha256": "de73c7154652eddfaf69cdc3b2f824d5c452f095f40a20a1c97bb1b5c4d80ab2" + }, + "hackernews-400_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-400_000.ndjson", + "sha256": "c1b00a24689110f366447e434c201c086d6f456d54ed1c4995894102794d8fe7" + }, + "hackernews-500_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-500_000.ndjson", + "sha256": "ae98f9dbef8193d750e3e2dbb6a91648941a1edca5f6e82c143e7996f4840083" + }, + "hackernews-600_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-600_000.ndjson", + "sha256": 
"b495fdc72c4a944801f786400f22076ab99186bee9699f67cbab2f21f5b74dbe" + }, + "hackernews-700_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-700_000.ndjson", + "sha256": "4b2c63974f3dabaa4954e3d4598b48324d03c522321ac05b0d583f36cb78a28b" + }, + "hackernews-800_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-800_000.ndjson", + "sha256": "cb7b6afe0e6caa1be111be256821bc63b0771b2a0e1fad95af7aaeeffd7ba546" + }, + "hackernews-900_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-900_000.ndjson", + "sha256": "e1154ddcd398f1c867758a93db5bcb21a07b9e55530c188a2917fdef332d3ba9" + }, + "hackernews-1_000_000.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-1_000_000.ndjson", + "sha256": "27e25efd0b68b159b8b21350d9af76938710cb29ce0393fa71b41c4f3c630ffe" + } + }, + "commands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time" + ], + "searchableAttributes": [ + "title" + ], + "filterableAttributes": [ + "by" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-100_000.ndjson" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-200_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-300_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-400_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-500_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-600_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-700_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-800_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-900_000.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-1_000_000.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/movies-nothreads.json b/workloads/movies-nothreads.json new file mode 100644 index 000000000..175daacf9 --- /dev/null +++ b/workloads/movies-nothreads.json @@ -0,0 +1,44 @@ +{ + "name": "movies.json,no-threads", + "run_count": 2, + "extra_cli_args": [ + "--max-indexing-threads=1" + ], + "assets": { + "movies.json": { + "local_location": null, + "remote_location": 
"https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies.json", + "sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1" + } + }, + "commands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies.json" + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/movies.json b/workloads/movies.json new file mode 100644 index 000000000..445ff3aca --- /dev/null +++ b/workloads/movies.json @@ -0,0 +1,42 @@ +{ + "name": "movies.json", + "run_count": 10, + "extra_cli_args": [], + "assets": { + "movies.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies.json", + "sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1" + } + }, + "commands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies.json" + }, + "synchronous": "WaitForTask" + } + ] +} From adcd848809647c7167f23b0612c8e69452ed0beb Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 4 Mar 2024 23:03:26 +0100 Subject: [PATCH 44/52] CI: Add bench workflows --- .github/workflows/bench-manual.yml | 30 +++++++++++++++ .github/workflows/bench-pr.yml | 46 +++++++++++++++++++++++ .github/workflows/bench-push-indexing.yml | 25 ++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 .github/workflows/bench-manual.yml create mode 100644 .github/workflows/bench-pr.yml create mode 100644 .github/workflows/bench-push-indexing.yml diff --git a/.github/workflows/bench-manual.yml b/.github/workflows/bench-manual.yml new file mode 100644 index 000000000..6d8c3a006 --- /dev/null +++ b/.github/workflows/bench-manual.yml @@ -0,0 +1,30 @@ +name: Bench (manual) + +on: + workflow_dispatch: + inputs: + workload: + description: 'The path to the workloads to execute (workloads/...)' + required: true + default: 'workloads/movies.json' + +env: + WORKLOAD_NAME: ${{ github.event.inputs.workload }} + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Run benchmarks - workload ${WORKLOAD_NAME} - branch ${{ github.ref }} - commit ${{ github.sha }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Manual [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- ${WORKLOAD_NAME} + diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml new file mode 100644 index 000000000..6f4956542 --- /dev/null +++ b/.github/workflows/bench-pr.yml @@ -0,0 +1,46 @@ +name: Bench (PR) +on: + issue_comment: + types: [created] + +permissions: + issues: write + +env: + GH_TOKEN: 
${{ secrets.MEILI_BOT_GH_PAT }} + +jobs: + run-benchmarks-on-comment: + if: startsWith(github.event.comment.body, '/bench') + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - name: Check for Command + id: command + uses: xt0rted/slash-command-action@v2 + with: + command: bench + reaction-type: "rocket" + repo-token: ${{ env.GH_TOKEN }} + + - uses: xt0rted/pull-request-comment-branch@v2 + id: comment-branch + with: + repo_token: ${{ env.GH_TOKEN }} + + - uses: actions/checkout@v3 + if: success() + with: + fetch-depth: 0 # fetch full history to be able to get main commit sha + ref: ${{ steps.comment-branch.outputs.head_ref }} + + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Run benchmarks on PR ${{ github.event.issue.id }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.url }}) on [#${{github.event.issue.id}}](${{ github.event.issue.url }})" -- ${{ steps.command.outputs.command-arguments }} \ No newline at end of file diff --git a/.github/workflows/bench-push-indexing.yml b/.github/workflows/bench-push-indexing.yml new file mode 100644 index 000000000..fd0f19a5a --- /dev/null +++ b/.github/workflows/bench-push-indexing.yml @@ -0,0 +1,25 @@ +name: Indexing bench (push) + +on: + push: + branches: + - main + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch main - Commit ${{ github.sha }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Push on `main` [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- workloads/*.json + From 25f64ce7df90befde69fadc9a031c4d0418ad8ed Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 11:05:20 +0100 Subject: [PATCH 45/52] Replace logging timer by spans --- Cargo.lock | 23 ------------------- milli/Cargo.toml | 6 ++--- milli/src/search/new/bucket_sort.rs | 2 +- milli/src/search/new/mod.rs | 12 ++++++++-- .../src/search/new/query_term/parse_query.rs | 2 +- 5 files changed, 15 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 971ab602a..3c7d28055 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3023,28 +3023,6 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -[[package]] -name = "logging_timer" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64e96f261d684b7089aa576bb74e823241dccd994b27d30fabf1dcb3af284fe9" -dependencies = [ - "log", - "logging_timer_proc_macros", -] - -[[package]] -name = "logging_timer_proc_macros" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a9062912d7952c5588cc474795e0b9ee008e7e6781127945b85413d4b99d81" -dependencies = [ - "log", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "lz4_flex" version = "0.10.0" @@ -3316,7 +3294,6 @@ dependencies = [ "json-depth-checker", "levenshtein_automata", "liquid", - "logging_timer", "maplit", "md5", 
"meili-snap", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7e45168ed..1dfa495ea 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -70,13 +70,13 @@ itertools = "0.11.0" # profiling puffin = "0.16.0" -# logging -logging_timer = "1.1.0" csv = "1.3.0" candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] } +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = [ + "onig", +] } hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ "online", ] } diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index e7bafaf70..02528e378 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -15,7 +15,7 @@ pub struct BucketSortOutput { // TODO: would probably be good to regroup some of these inside of a struct? #[allow(clippy::too_many_arguments)] -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")] pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7b3b1d5b2..ae661e3f6 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -191,7 +191,7 @@ fn resolve_maximally_reduced_query_graph( Ok(docids) } -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search")] fn resolve_universe( ctx: &mut SearchContext, initial_universe: &RoaringBitmap, @@ -557,7 +557,7 @@ pub fn execute_vector_search( } #[allow(clippy::too_many_arguments)] -#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search")] pub fn execute_search( ctx: &mut SearchContext, query: Option<&str>, @@ -577,6 +577,9 @@ pub fn execute_search( let mut located_query_terms = None; let query_terms = if let Some(query) = query { + let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder"); + let entered = span.enter(); + // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. let mut tokbuilder = TokenizerBuilder::new(); @@ -605,7 +608,12 @@ pub fn execute_search( } let tokenizer = tokbuilder.build(); + drop(entered); + + let span = tracing::trace_span!(target: "search::tokens", "tokenize"); + let entered = span.enter(); let tokens = tokenizer.tokenize(query); + drop(entered); let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?; if query_terms.is_empty() { diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 8ab93ed3b..ea997a41a 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -9,7 +9,7 @@ use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. 
-#[logging_timer::time] +#[tracing::instrument(level = "trace", skip_all, target = "search::query")] pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, From b1309179339745355d98cdd255327a58e4eb6733 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 28 Feb 2024 15:53:01 +0100 Subject: [PATCH 46/52] add the content type in the webhook + improve the test --- index-scheduler/src/lib.rs | 4 +++- meilisearch/tests/tasks/webhook.rs | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 9544004cc..adb3d4942 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1395,7 +1395,9 @@ impl IndexScheduler { // let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default()); - let request = ureq::post(url).set("Content-Encoding", "gzip"); + let request = ureq::post(url) + .set("Content-Encoding", "gzip") + .set("Content-Type", "application/x-ndjson"); let request = match &self.webhook_authorization_header { Some(header) => request.set("Authorization", header), None => request, diff --git a/meilisearch/tests/tasks/webhook.rs b/meilisearch/tests/tasks/webhook.rs index a18a93edb..b01ef3d5a 100644 --- a/meilisearch/tests/tasks/webhook.rs +++ b/meilisearch/tests/tasks/webhook.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use actix_http::body::MessageBody; use actix_web::dev::{ServiceFactory, ServiceResponse}; use actix_web::web::{Bytes, Data}; -use actix_web::{post, App, HttpResponse, HttpServer}; +use actix_web::{post, App, HttpRequest, HttpResponse, HttpServer}; use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tokio::sync::mpsc; @@ -17,7 +17,17 @@ use crate::common::{default_settings, Server}; use crate::json; #[post("/")] -async fn forward_body(sender: Data>>, body: Bytes) -> HttpResponse { +async fn forward_body( + req: HttpRequest, + sender: Data>>, + body: Bytes, +) -> HttpResponse { + let headers = req.headers(); + assert_eq!(headers.get("content-type").unwrap(), "application/x-ndjson"); + assert_eq!(headers.get("transfer-encoding").unwrap(), "chunked"); + assert_eq!(headers.get("accept-encoding").unwrap(), "gzip"); + assert_eq!(headers.get("content-encoding").unwrap(), "gzip"); + let body = body.to_vec(); sender.send(body).unwrap(); HttpResponse::Ok().into() From 36d17110d8f50f294734c5fcf9f226227ee8d441 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 12:18:54 +0100 Subject: [PATCH 47/52] openai: Handle BAD_GETAWAY, be more resilient to failure --- milli/src/vector/error.rs | 6 +++--- milli/src/vector/openai.rs | 26 +++++++------------------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 3673c85e3..fbe4ee878 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -59,8 +59,8 @@ pub enum EmbedErrorKind { OpenAiAuth(OpenAiError), #[error("sent too many requests to OpenAI: {0}")] OpenAiTooManyRequests(OpenAiError), - #[error("received internal error from OpenAI: {0}")] - OpenAiInternalServerError(OpenAiError), + #[error("received internal error from OpenAI: {0:?}")] + OpenAiInternalServerError(Option), #[error("sent too many tokens in a request to OpenAI: {0}")] OpenAiTooManyTokens(OpenAiError), #[error("received unhandled HTTP status code {0} from OpenAI")] @@ -106,7 +106,7 @@ impl EmbedError { Self { kind: 
EmbedErrorKind::OpenAiTooManyRequests(inner), fault: FaultSource::Runtime } } - pub(crate) fn openai_internal_server_error(inner: OpenAiError) -> EmbedError { + pub(crate) fn openai_internal_server_error(inner: Option) -> EmbedError { Self { kind: EmbedErrorKind::OpenAiInternalServerError(inner), fault: FaultSource::Runtime } } diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index cbddddfb7..cfc4b6e83 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -220,24 +220,12 @@ impl Embedder { error_response.error, ))); } - StatusCode::INTERNAL_SERVER_ERROR => { - let error_response: OpenAiErrorResponse = response - .json() - .await - .map_err(EmbedError::openai_unexpected) - .map_err(Retry::retry_later)?; + StatusCode::INTERNAL_SERVER_ERROR + | StatusCode::BAD_GATEWAY + | StatusCode::SERVICE_UNAVAILABLE => { + let error_response: Result = response.json().await; return Err(Retry::retry_later(EmbedError::openai_internal_server_error( - error_response.error, - ))); - } - StatusCode::SERVICE_UNAVAILABLE => { - let error_response: OpenAiErrorResponse = response - .json() - .await - .map_err(EmbedError::openai_unexpected) - .map_err(Retry::retry_later)?; - return Err(Retry::retry_later(EmbedError::openai_internal_server_error( - error_response.error, + error_response.ok().map(|error_response| error_response.error), ))); } StatusCode::BAD_REQUEST => { @@ -248,14 +236,14 @@ impl Embedder { .map_err(EmbedError::openai_unexpected) .map_err(Retry::retry_later)?; - tracing::warn!("OpenAI: input was too long, retrying on tokenized version. For best performance, limit the size of your prompt."); + tracing::warn!("OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. For best performance, limit the size of your prompt."); return Err(Retry::retry_tokenized(EmbedError::openai_too_many_tokens( error_response.error, ))); } code => { - return Err(Retry::give_up(EmbedError::openai_unhandled_status_code( + return Err(Retry::retry_later(EmbedError::openai_unhandled_status_code( code.as_u16(), ))); } From 0c216048b53ecbf4fe2cb63f1a03786aed7dadab Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 12:19:25 +0100 Subject: [PATCH 48/52] Cap timeout duration --- milli/src/vector/openai.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index cfc4b6e83..33442dda4 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -178,6 +178,8 @@ impl Embedder { retry.into_duration(attempt) } }?; + + let retry_duration = retry_duration.min(std::time::Duration::from_secs(60)); // don't wait more than a minute tracing::warn!( "Attempt #{}, retrying after {}ms.", attempt, From 7ee20b0895f902275c727784b4b098e238e16abf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 14:42:06 +0100 Subject: [PATCH 49/52] Refactor xtask bench --- xtask/src/bench/assets.rs | 250 ++++++++ xtask/src/bench/client.rs | 80 +++ xtask/src/bench/command.rs | 194 ++++++ xtask/src/bench/dashboard.rs | 167 +++++ xtask/src/bench/meili_process.rs | 112 ++++ xtask/src/bench/mod.rs | 1014 +----------------------------- xtask/src/bench/workload.rs | 262 ++++++++ 7 files changed, 1094 insertions(+), 985 deletions(-) create mode 100644 xtask/src/bench/assets.rs create mode 100644 xtask/src/bench/client.rs create mode 100644 xtask/src/bench/command.rs create mode 100644 xtask/src/bench/dashboard.rs create mode 100644 xtask/src/bench/meili_process.rs create mode 100644 
xtask/src/bench/workload.rs diff --git a/xtask/src/bench/assets.rs b/xtask/src/bench/assets.rs new file mode 100644 index 000000000..241928dbf --- /dev/null +++ b/xtask/src/bench/assets.rs @@ -0,0 +1,250 @@ +use std::collections::BTreeMap; +use std::io::{Read as _, Seek as _, Write as _}; + +use anyhow::{bail, Context}; +use futures_util::TryStreamExt as _; +use serde::Deserialize; +use sha2::Digest; + +use super::client::Client; + +#[derive(Deserialize, Clone)] +pub struct Asset { + pub local_location: Option, + pub remote_location: Option, + #[serde(default)] + pub format: AssetFormat, + pub sha256: Option, +} + +#[derive(Deserialize, Default, Copy, Clone)] +pub enum AssetFormat { + #[default] + Auto, + Json, + NdJson, + Raw, +} + +impl AssetFormat { + pub fn to_content_type(self, filename: &str) -> &'static str { + match self { + AssetFormat::Auto => Self::auto_detect(filename).to_content_type(filename), + AssetFormat::Json => "application/json", + AssetFormat::NdJson => "application/x-ndjson", + AssetFormat::Raw => "application/octet-stream", + } + } + + fn auto_detect(filename: &str) -> Self { + let path = std::path::Path::new(filename); + match path.extension().and_then(|extension| extension.to_str()) { + Some(extension) if extension.eq_ignore_ascii_case("json") => Self::Json, + Some(extension) if extension.eq_ignore_ascii_case("ndjson") => Self::NdJson, + extension => { + tracing::warn!(asset = filename, ?extension, "asset has format `Auto`, but extension was not recognized. Specify `Raw` format to suppress this warning."); + AssetFormat::Raw + } + } + } +} + +pub fn fetch_asset( + name: &str, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<(std::fs::File, AssetFormat)> { + let asset = + assets.get(name).with_context(|| format!("could not find asset with name '{name}'"))?; + let filename = if let Some(local_filename) = &asset.local_location { + local_filename.clone() + } else { + format!("{asset_folder}/{name}") + }; + + Ok(( + std::fs::File::open(&filename) + .with_context(|| format!("could not open asset '{name}' at '{filename}'"))?, + asset.format, + )) +} + +#[tracing::instrument(skip(client, assets), fields(asset_count = assets.len()))] +pub async fn fetch_assets( + client: &Client, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + let mut download_tasks = tokio::task::JoinSet::new(); + for (name, asset) in assets { + // trying local + if let Some(local) = &asset.local_location { + match std::fs::File::open(local) { + Ok(file) => { + if check_sha256(name, asset, file)? { + continue; + } else { + tracing::warn!(asset = name, file = local, "found local resource for asset but hash differed, skipping to asset store"); + } + } + Err(error) => match error.kind() { + std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ + } + _ => tracing::warn!( + error = &error as &dyn std::error::Error, + "error checking local resource, skipping to asset store" + ), + }, + } + } + + // checking asset store + let store_filename = format!("{}/{}", asset_folder, name); + + match std::fs::File::open(&store_filename) { + Ok(file) => { + if check_sha256(name, asset, file)? 
{ + continue; + } else { + tracing::warn!(asset = name, file = store_filename, "found resource for asset in asset store, but hash differed, skipping to remote method"); + } + } + Err(error) => match error.kind() { + std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ + } + _ => tracing::warn!( + error = &error as &dyn std::error::Error, + "error checking resource in store, skipping to remote method" + ), + }, + } + + // downloading remote + match &asset.remote_location { + Some(location) => { + std::fs::create_dir_all(asset_folder).with_context(|| format!("could not create asset folder at {asset_folder}"))?; + download_tasks.spawn({ + let client = client.clone(); + let name = name.to_string(); + let location = location.to_string(); + let store_filename = store_filename.clone(); + let asset = asset.clone(); + download_asset(client, name, asset, location, store_filename)}); + }, + None => bail!("asset {name} has no remote location, but was not found locally or in the asset store"), + } + } + + while let Some(res) = download_tasks.join_next().await { + res.context("download task panicked")?.context("download task failed")?; + } + + Ok(()) +} + +fn check_sha256(name: &str, asset: &Asset, mut file: std::fs::File) -> anyhow::Result { + let mut bytes = Vec::new(); + file.read_to_end(&mut bytes).with_context(|| format!("hashing file for asset {name}"))?; + let mut file_hash = sha2::Sha256::new(); + file_hash.update(&bytes); + let file_hash = file_hash.finalize(); + let file_hash = format!("{:x}", file_hash); + tracing::debug!(hash = file_hash, "hashed local file"); + + Ok(match &asset.sha256 { + Some(hash) => { + tracing::debug!(hash, "hash from workload"); + if hash.to_ascii_lowercase() == file_hash { + true + } else { + tracing::warn!( + file_hash, + asset_hash = hash.to_ascii_lowercase(), + "hashes don't match" + ); + false + } + } + None => { + tracing::warn!(sha256 = file_hash, "Skipping hash for asset {name} that doesn't have one. 
Please add it to workload file"); + true + } + }) +} + +#[tracing::instrument(skip(client, asset, name), fields(asset = name))] +async fn download_asset( + client: Client, + name: String, + asset: Asset, + src: String, + dest_filename: String, +) -> anyhow::Result<()> { + let context = || format!("failure downloading asset {name} from {src}"); + + let response = client.get(&src).send().await.with_context(context)?; + + let file = std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&dest_filename) + .with_context(|| format!("creating destination file {dest_filename}")) + .with_context(context)?; + + let mut dest = std::io::BufWriter::new( + file.try_clone().context("cloning I/O handle").with_context(context)?, + ); + + let total_len: Option = response + .headers() + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse().ok()); + + let progress = tokio::spawn({ + let name = name.clone(); + async move { + loop { + match file.metadata().context("could not get file metadata") { + Ok(metadata) => { + let len = metadata.len(); + tracing::info!( + asset = name, + downloaded_bytes = len, + total_bytes = total_len, + "asset download in progress" + ); + } + Err(error) => { + tracing::warn!(%error, "could not get file metadata"); + } + } + tokio::time::sleep(std::time::Duration::from_secs(60)).await; + } + } + }); + + let writing_context = || format!("while writing to destination file at {dest_filename}"); + + let mut response = response.bytes_stream(); + + while let Some(bytes) = + response.try_next().await.context("while downloading file").with_context(context)? + { + dest.write_all(&bytes).with_context(writing_context).with_context(context)?; + } + + progress.abort(); + + let mut file = dest.into_inner().with_context(writing_context).with_context(context)?; + + file.rewind().context("while rewinding asset file")?; + + if !check_sha256(&name, &asset, file)? 
{ + bail!("asset '{name}': sha256 mismatch for file {dest_filename} downloaded from {src}") + } + + Ok(()) +} diff --git a/xtask/src/bench/client.rs b/xtask/src/bench/client.rs new file mode 100644 index 000000000..3e46615cc --- /dev/null +++ b/xtask/src/bench/client.rs @@ -0,0 +1,80 @@ +use anyhow::Context; +use serde::Deserialize; + +#[derive(Debug, Clone)] +pub struct Client { + base_url: Option, + client: reqwest::Client, +} + +impl Client { + pub fn new( + base_url: Option, + api_key: Option<&str>, + timeout: Option, + ) -> anyhow::Result { + let mut headers = reqwest::header::HeaderMap::new(); + if let Some(api_key) = api_key { + headers.append( + reqwest::header::AUTHORIZATION, + reqwest::header::HeaderValue::from_str(&format!("Bearer {api_key}")) + .context("Invalid authorization header")?, + ); + } + + let client = reqwest::ClientBuilder::new().default_headers(headers); + let client = if let Some(timeout) = timeout { client.timeout(timeout) } else { client }; + let client = client.build()?; + Ok(Self { base_url, client }) + } + + pub fn request(&self, method: reqwest::Method, route: &str) -> reqwest::RequestBuilder { + if let Some(base_url) = &self.base_url { + if route.is_empty() { + self.client.request(method, base_url) + } else { + self.client.request(method, format!("{}/{}", base_url, route)) + } + } else { + self.client.request(method, route) + } + } + + pub fn get(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::GET, route) + } + + pub fn put(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::PUT, route) + } + + pub fn post(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::POST, route) + } + + pub fn delete(&self, route: &str) -> reqwest::RequestBuilder { + self.request(reqwest::Method::DELETE, route) + } +} + +#[derive(Debug, Clone, Copy, Deserialize)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum Method { + Get, + Post, + Patch, + Delete, + Put, +} + +impl From for reqwest::Method { + fn from(value: Method) -> Self { + match value { + Method::Get => Self::GET, + Method::Post => Self::POST, + Method::Patch => Self::PATCH, + Method::Delete => Self::DELETE, + Method::Put => Self::PUT, + } + } +} diff --git a/xtask/src/bench/command.rs b/xtask/src/bench/command.rs new file mode 100644 index 000000000..0f0b5d213 --- /dev/null +++ b/xtask/src/bench/command.rs @@ -0,0 +1,194 @@ +use std::collections::BTreeMap; +use std::fmt::Display; +use std::io::Read as _; + +use anyhow::{bail, Context as _}; +use serde::Deserialize; + +use super::assets::{fetch_asset, Asset}; +use super::client::{Client, Method}; + +#[derive(Clone, Deserialize)] +pub struct Command { + pub route: String, + pub method: Method, + #[serde(default)] + pub body: Body, + #[serde(default)] + pub synchronous: SyncMode, +} + +#[derive(Default, Clone, Deserialize)] +#[serde(untagged)] +pub enum Body { + Inline { + inline: serde_json::Value, + }, + Asset { + asset: String, + }, + #[default] + Empty, +} + +impl Body { + pub fn get( + self, + assets: &BTreeMap, + asset_folder: &str, + ) -> anyhow::Result, &'static str)>> { + Ok(match self { + Body::Inline { inline: body } => Some(( + serde_json::to_vec(&body) + .context("serializing to bytes") + .context("while getting inline body")?, + "application/json", + )), + Body::Asset { asset: name } => Some({ + let context = || format!("while getting body from asset '{name}'"); + let (mut file, format) = + fetch_asset(&name, assets, asset_folder).with_context(context)?; + let 
mut buf = Vec::new(); + file.read_to_end(&mut buf).with_context(context)?; + (buf, format.to_content_type(&name)) + }), + Body::Empty => None, + }) + } +} + +impl Display for Command { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?} {} ({:?})", self.method, self.route, self.synchronous) + } +} + +#[derive(Default, Debug, Clone, Copy, Deserialize)] +pub enum SyncMode { + DontWait, + #[default] + WaitForResponse, + WaitForTask, +} + +pub async fn run_batch( + client: &Client, + batch: &[Command], + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + let [.., last] = batch else { return Ok(()) }; + let sync = last.synchronous; + + let mut tasks = tokio::task::JoinSet::new(); + + for command in batch { + // FIXME: you probably don't want to copy assets everytime here + tasks.spawn({ + let client = client.clone(); + let command = command.clone(); + let assets = assets.clone(); + let asset_folder = asset_folder.to_owned(); + + async move { run(client, command, &assets, &asset_folder).await } + }); + } + + while let Some(result) = tasks.join_next().await { + result + .context("panicked while executing command")? + .context("error while executing command")?; + } + + match sync { + SyncMode::DontWait => {} + SyncMode::WaitForResponse => {} + SyncMode::WaitForTask => wait_for_tasks(client).await?, + } + + Ok(()) +} + +async fn wait_for_tasks(client: &Client) -> anyhow::Result<()> { + loop { + let response = client + .get("tasks?statuses=enqueued,processing") + .send() + .await + .context("could not wait for tasks")?; + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response to JSON") + .context("could not wait for tasks")?; + match response.get("total") { + Some(serde_json::Value::Number(number)) => { + let number = number.as_u64().with_context(|| { + format!("waiting for tasks: could not parse 'total' as integer, got {}", number) + })?; + if number == 0 { + break; + } else { + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + continue; + } + } + Some(thing_else) => { + bail!(format!( + "waiting for tasks: could not parse 'total' as a number, got '{thing_else}'" + )) + } + None => { + bail!(format!( + "waiting for tasks: expected response to contain 'total', got '{response}'" + )) + } + } + } + Ok(()) +} + +#[tracing::instrument(skip(client, command, assets, asset_folder), fields(command = %command))] +pub async fn run( + client: Client, + mut command: Command, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + // memtake the body here to leave an empty body in its place, so that command is not partially moved-out + let body = std::mem::take(&mut command.body) + .get(assets, asset_folder) + .with_context(|| format!("while getting body for command {command}"))?; + + let request = client.request(command.method.into(), &command.route); + + let request = if let Some((body, content_type)) = body { + request.body(body).header(reqwest::header::CONTENT_TYPE, content_type) + } else { + request + }; + + let response = + request.send().await.with_context(|| format!("error sending command: {}", command))?; + + let code = response.status(); + if code.is_client_error() { + tracing::error!(%command, %code, "error in workload file"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("parsing error in workload file when sending command")?; + bail!("error in workload file: server responded with error 
code {code} and '{response}'") + } else if code.is_server_error() { + tracing::error!(%command, %code, "server error"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("parsing server error when sending command")?; + bail!("server error: server responded with error code {code} and '{response}'") + } + + Ok(()) +} diff --git a/xtask/src/bench/dashboard.rs b/xtask/src/bench/dashboard.rs new file mode 100644 index 000000000..833426207 --- /dev/null +++ b/xtask/src/bench/dashboard.rs @@ -0,0 +1,167 @@ +use std::collections::BTreeMap; + +use anyhow::{bail, Context}; +use serde_json::json; +use tokio::signal::ctrl_c; +use tokio::task::AbortHandle; +use tracing_trace::processor::span_stats::CallStats; +use uuid::Uuid; + +use super::client::Client; +use super::env_info; +use super::workload::Workload; + +pub async fn cancel_on_ctrl_c( + invocation_uuid: Uuid, + dashboard_client: Client, + abort_handle: AbortHandle, +) { + tracing::info!("press Ctrl-C to cancel the invocation"); + match ctrl_c().await { + Ok(()) => { + tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation"); + mark_as_failed(dashboard_client, invocation_uuid, None).await; + abort_handle.abort(); + } + Err(error) => tracing::warn!( + error = &error as &dyn std::error::Error, + "failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C" + ), + } +} + +pub async fn mark_as_failed( + dashboard_client: Client, + invocation_uuid: Uuid, + failure_reason: Option, +) { + let response = dashboard_client + .post("cancel-invocation") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "failure_reason": failure_reason, + })) + .send() + .await; + let response = match response { + Ok(response) => response, + Err(response_error) => { + tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed"); + return; + } + }; + + if !response.status().is_success() { + tracing::error!( + %invocation_uuid, + "could not mark invocation as failed: {}", + response.text().await.unwrap() + ); + return; + } + tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); +} + +pub async fn send_machine_info( + dashboard_client: &Client, + env: &env_info::Environment, +) -> anyhow::Result<()> { + let response = dashboard_client + .put("machine") + .json(&json!({"hostname": env.hostname})) + .send() + .await + .context("sending machine information")?; + if !response.status().is_success() { + bail!( + "could not send machine information: {} {}", + response.status(), + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); + } + Ok(()) +} + +pub async fn create_invocation( + dashboard_client: &Client, + build_info: build_info::BuildInfo, + commit_message: &str, + env: env_info::Environment, + max_workloads: usize, + reason: Option<&str>, +) -> anyhow::Result { + let response = dashboard_client + .put("invocation") + .json(&json!({ + "commit": { + "sha1": build_info.commit_sha1, + "message": commit_message, + "commit_date": build_info.commit_timestamp, + "branch": build_info.branch, + "tag": build_info.describe.and_then(|describe| describe.as_tag()), + }, + "machine_hostname": env.hostname, + "max_workloads": max_workloads, + "reason": reason + })) + .send() + .await + .context("sending invocation")?; + if !response.status().is_success() { + bail!( + "could not send new invocation: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ); + } + let 
invocation_uuid: Uuid = + response.json().await.context("could not deserialize invocation response as JSON")?; + Ok(invocation_uuid) +} + +pub async fn create_workload( + dashboard_client: &Client, + invocation_uuid: Uuid, + workload: &Workload, +) -> anyhow::Result { + let response = dashboard_client + .put("workload") + .json(&json!({ + "invocation_uuid": invocation_uuid, + "name": &workload.name, + "max_runs": workload.run_count, + })) + .send() + .await + .context("could not create new workload")?; + + if !response.status().is_success() { + bail!("creating new workload failed: {}", response.text().await.unwrap()) + } + + let workload_uuid: Uuid = + response.json().await.context("could not deserialize JSON as UUID")?; + Ok(workload_uuid) +} + +pub async fn create_run( + dashboard_client: Client, + workload_uuid: Uuid, + report: &BTreeMap, +) -> anyhow::Result<()> { + let response = dashboard_client + .put("run") + .json(&json!({ + "workload_uuid": workload_uuid, + "data": report + })) + .send() + .await + .context("sending new run")?; + if !response.status().is_success() { + bail!( + "sending new run failed: {}", + response.text().await.unwrap_or_else(|_| "unknown".into()) + ) + } + Ok(()) +} diff --git a/xtask/src/bench/meili_process.rs b/xtask/src/bench/meili_process.rs new file mode 100644 index 000000000..99f6f4ea6 --- /dev/null +++ b/xtask/src/bench/meili_process.rs @@ -0,0 +1,112 @@ +use std::collections::BTreeMap; + +use anyhow::{bail, Context as _}; + +use super::assets::Asset; +use super::client::Client; +use super::workload::Workload; + +pub async fn kill(mut meilisearch: tokio::process::Child) { + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } +} + +#[tracing::instrument] +pub async fn build() -> anyhow::Result<()> { + let mut command = tokio::process::Command::new("cargo"); + command.arg("build").arg("--release").arg("-p").arg("meilisearch"); + + command.kill_on_drop(true); + + let mut builder = command.spawn().context("error building Meilisearch")?; + + if !builder.wait().await.context("could not build Meilisearch")?.success() { + bail!("failed building Meilisearch") + } + + Ok(()) +} + +#[tracing::instrument(skip(client, master_key, workload), fields(workload = workload.name))] +pub async fn start( + client: &Client, + master_key: Option<&str>, + workload: &Workload, + asset_folder: &str, +) -> anyhow::Result { + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + + command.arg("--db-path").arg("./_xtask_benchmark.ms"); + if let Some(master_key) = master_key { + command.arg("--master-key").arg(master_key); + } + command.arg("--experimental-enable-logs-route"); + + for extra_arg in workload.extra_cli_args.iter() { + command.arg(extra_arg); + } + + command.kill_on_drop(true); + + let mut meilisearch = command.spawn().context("Error starting Meilisearch")?; + + wait_for_health(client, &mut meilisearch, &workload.assets, asset_folder).await?; + + Ok(meilisearch) +} + +async fn wait_for_health( + client: &Client, + meilisearch: &mut tokio::process::Child, + assets: &BTreeMap, + asset_folder: &str, +) -> anyhow::Result<()> { + for i in 0..100 { + let res = super::command::run(client.clone(), health_command(), assets, asset_folder).await; + if res.is_ok() { + // check that this is actually the current Meilisearch instance that answered 
us + if let Some(exit_code) = + meilisearch.try_wait().context("cannot check Meilisearch server process status")? + { + tracing::error!("Got an health response from a different process"); + bail!("Meilisearch server exited early with code {exit_code}"); + } + + return Ok(()); + } + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + // check whether the Meilisearch instance exited early (cut the wait) + if let Some(exit_code) = + meilisearch.try_wait().context("cannot check Meilisearch server process status")? + { + bail!("Meilisearch server exited early with code {exit_code}"); + } + tracing::debug!(attempt = i, "Waiting for Meilisearch to go up"); + } + bail!("meilisearch is not responding") +} + +fn health_command() -> super::command::Command { + super::command::Command { + route: "/health".into(), + method: super::client::Method::Get, + body: Default::default(), + synchronous: super::command::SyncMode::WaitForResponse, + } +} + +pub fn delete_db() { + let _ = std::fs::remove_dir_all("./_xtask_benchmark.ms"); +} diff --git a/xtask/src/bench/mod.rs b/xtask/src/bench/mod.rs index cfc7c124f..62c11b604 100644 --- a/xtask/src/bench/mod.rs +++ b/xtask/src/bench/mod.rs @@ -1,20 +1,21 @@ +mod assets; +mod client; +mod command; +mod dashboard; mod env_info; +mod meili_process; +mod workload; -use std::collections::BTreeMap; -use std::fmt::Display; -use std::io::{Read, Seek, Write}; use std::path::PathBuf; -use anyhow::{bail, Context}; +use anyhow::Context; use clap::Parser; -use futures_util::TryStreamExt; -use serde::Deserialize; -use serde_json::json; -use sha2::Digest; use tracing_subscriber::fmt::format::FmtSpan; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::Layer; -use uuid::Uuid; + +use self::client::Client; +use self::workload::Workload; pub fn default_http_addr() -> String { "127.0.0.1:7700".to_string() @@ -35,62 +36,6 @@ pub fn default_dashboard_url() -> String { "http://localhost:9001".into() } -#[derive(Debug, Clone)] -pub struct Client { - base_url: Option, - client: reqwest::Client, -} - -impl Client { - pub fn new( - base_url: Option, - api_key: Option<&str>, - timeout: Option, - ) -> anyhow::Result { - let mut headers = reqwest::header::HeaderMap::new(); - if let Some(api_key) = api_key { - headers.append( - reqwest::header::AUTHORIZATION, - reqwest::header::HeaderValue::from_str(&format!("Bearer {api_key}")) - .context("Invalid authorization header")?, - ); - } - - let client = reqwest::ClientBuilder::new().default_headers(headers); - let client = if let Some(timeout) = timeout { client.timeout(timeout) } else { client }; - let client = client.build()?; - Ok(Self { base_url, client }) - } - - pub fn request(&self, method: reqwest::Method, route: &str) -> reqwest::RequestBuilder { - if let Some(base_url) = &self.base_url { - if route.is_empty() { - self.client.request(method, base_url) - } else { - self.client.request(method, format!("{}/{}", base_url, route)) - } - } else { - self.client.request(method, route) - } - } - - pub fn get(&self, route: &str) -> reqwest::RequestBuilder { - self.request(reqwest::Method::GET, route) - } - - pub fn put(&self, route: &str) -> reqwest::RequestBuilder { - self.request(reqwest::Method::PUT, route) - } - - pub fn post(&self, route: &str) -> reqwest::RequestBuilder { - self.request(reqwest::Method::POST, route) - } - - pub fn delete(&self, route: &str) -> reqwest::RequestBuilder { - self.request(reqwest::Method::DELETE, route) - } -} - /// Run benchmarks from a workload #[derive(Parser, Debug)] pub struct 
BenchDeriveArgs { @@ -134,166 +79,11 @@ pub struct BenchDeriveArgs { reason: Option, } -#[derive(Deserialize)] -pub struct Workload { - pub name: String, - pub run_count: u16, - pub extra_cli_args: Vec, - pub assets: BTreeMap, - pub commands: Vec, -} - -#[derive(Deserialize, Clone)] -pub struct Asset { - pub local_location: Option, - pub remote_location: Option, - #[serde(default)] - pub format: AssetFormat, - pub sha256: Option, -} - -#[derive(Deserialize, Default, Copy, Clone)] -pub enum AssetFormat { - #[default] - Auto, - Json, - NdJson, - Raw, -} -impl AssetFormat { - fn to_content_type(self, filename: &str) -> &'static str { - match self { - AssetFormat::Auto => Self::auto_detect(filename).to_content_type(filename), - AssetFormat::Json => "application/json", - AssetFormat::NdJson => "application/x-ndjson", - AssetFormat::Raw => "application/octet-stream", - } - } - - fn auto_detect(filename: &str) -> Self { - let path = std::path::Path::new(filename); - match path.extension().and_then(|extension| extension.to_str()) { - Some(extension) if extension.eq_ignore_ascii_case("json") => Self::Json, - Some(extension) if extension.eq_ignore_ascii_case("ndjson") => Self::NdJson, - extension => { - tracing::warn!(asset = filename, ?extension, "asset has format `Auto`, but extension was not recognized. Specify `Raw` format to suppress this warning."); - AssetFormat::Raw - } - } - } -} - -#[derive(Clone, Deserialize)] -pub struct Command { - pub route: String, - pub method: Method, - #[serde(default)] - pub body: Body, - #[serde(default)] - pub synchronous: SyncMode, -} - -#[derive(Default, Clone, Deserialize)] -#[serde(untagged)] -pub enum Body { - Inline { - inline: serde_json::Value, - }, - Asset { - asset: String, - }, - #[default] - Empty, -} - -impl Body { - pub fn get( - self, - assets: &BTreeMap, - asset_folder: &str, - ) -> anyhow::Result, &'static str)>> { - Ok(match self { - Body::Inline { inline: body } => Some(( - serde_json::to_vec(&body) - .context("serializing to bytes") - .context("while getting inline body")?, - "application/json", - )), - Body::Asset { asset: name } => Some({ - let context = || format!("while getting body from asset '{name}'"); - let (mut file, format) = - fetch_asset(&name, assets, asset_folder).with_context(context)?; - let mut buf = Vec::new(); - file.read_to_end(&mut buf).with_context(context)?; - (buf, format.to_content_type(&name)) - }), - Body::Empty => None, - }) - } -} - -fn fetch_asset( - name: &str, - assets: &BTreeMap, - asset_folder: &str, -) -> anyhow::Result<(std::fs::File, AssetFormat)> { - let asset = - assets.get(name).with_context(|| format!("could not find asset with name '{name}'"))?; - let filename = if let Some(local_filename) = &asset.local_location { - local_filename.clone() - } else { - format!("{asset_folder}/{name}") - }; - - Ok(( - std::fs::File::open(&filename) - .with_context(|| format!("could not open asset '{name}' at '{filename}'"))?, - asset.format, - )) -} - -impl Display for Command { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?} {} ({:?})", self.method, self.route, self.synchronous) - } -} - -#[derive(Debug, Clone, Copy, Deserialize)] -pub enum Method { - GET, - POST, - PATCH, - DELETE, - PUT, -} - -impl From for reqwest::Method { - fn from(value: Method) -> Self { - match value { - Method::GET => Self::GET, - Method::POST => Self::POST, - Method::PATCH => Self::PATCH, - Method::DELETE => Self::DELETE, - Method::PUT => Self::PUT, - } - } -} - -#[derive(Default, Debug, Clone, Copy, 
Deserialize)] -pub enum SyncMode { - DontWait, - #[default] - WaitForResponse, - WaitForTask, -} - pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { + // setup logs let filter: tracing_subscriber::filter::Targets = args.log_filter.parse().context("invalid --log-filter")?; - let env = env_info::Environment::generate_from_current_config(); - let build_info = build_info::BuildInfo::from_build(); - let subscriber = tracing_subscriber::registry().with( tracing_subscriber::fmt::layer() .with_span_events(FmtSpan::NEW | FmtSpan::CLOSE) @@ -301,9 +91,15 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { ); tracing::subscriber::set_global_default(subscriber).context("could not setup logging")?; + // fetch environment and build info + let env = env_info::Environment::generate_from_current_config(); + let build_info = build_info::BuildInfo::from_build(); + + // tokio runtime let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?; let _scope = rt.enter(); + // setup clients let assets_client = Client::new(None, args.assets_key.as_deref(), Some(std::time::Duration::from_secs(3600)))?; // 1h @@ -328,55 +124,19 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { Some(std::time::Duration::from_secs(60)), )?; + // enter runtime + rt.block_on(async { - let response = dashboard_client - .put("machine") - .json(&json!({"hostname": env.hostname})) - .send() - .await - .context("sending machine information")?; - if !response.status().is_success() { - bail!( - "could not send machine information: {} {}", - response.status(), - response.text().await.unwrap_or_else(|_| "unknown".into()) - ); - } + dashboard::send_machine_info(&dashboard_client, &env).await?; let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); - let response = dashboard_client - .put("invocation") - .json(&json!({ - "commit": { - "sha1": build_info.commit_sha1, - "message": commit_message, - "commit_date": build_info.commit_timestamp, - "branch": build_info.branch, - "tag": build_info.describe.and_then(|describe| describe.as_tag()), - }, - "machine_hostname": env.hostname, - "max_workloads": max_workloads, - "reason": reason - })) - .send() - .await - .context("sending invocation")?; - - if !response.status().is_success() { - bail!( - "could not send new invocation: {}", - response.text().await.unwrap_or_else(|_| "unknown".into()) - ); - } - - let invocation_uuid: Uuid = - response.json().await.context("could not deserialize invocation response as JSON")?; - - + let invocation_uuid = dashboard::create_invocation(&dashboard_client, build_info, commit_message, env, max_workloads, reason).await?; tracing::info!(workload_count = args.workload_file.len(), "handling workload files"); + + // main task let workload_runs = tokio::spawn( { let dashboard_client = dashboard_client.clone(); @@ -388,7 +148,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { ) .with_context(|| format!("error parsing {} as JSON", workload_file.display()))?; - run_workload( + workload::execute( &assets_client, &dashboard_client, &logs_client, @@ -403,26 +163,14 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { Ok::<(), anyhow::Error>(()) }}); + // handle ctrl-c let abort_handle = workload_runs.abort_handle(); - tokio::spawn({ let dashboard_client = dashboard_client.clone(); - async move { - tracing::info!("press Ctrl-C to cancel the invocation"); - 
match tokio::signal::ctrl_c().await { - Ok(()) => { - tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation"); - mark_as_failed(dashboard_client, invocation_uuid, None).await; - abort_handle.abort(); - } - Err(error) => tracing::warn!( - error = &error as &dyn std::error::Error, - "failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C" - ), - } - } + dashboard::cancel_on_ctrl_c(invocation_uuid, dashboard_client, abort_handle) }); + // wait for the end of the main task, handle result match workload_runs.await { Ok(Ok(_)) => { tracing::info!("Success"); @@ -430,7 +178,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { } Ok(Err(error)) => { tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard"); - mark_as_failed(dashboard_client, invocation_uuid, Some(error.to_string())).await; + dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some(error.to_string())).await; tracing::warn!(%invocation_uuid, "invocation marked as failed following error"); Err(error) }, @@ -438,7 +186,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { match join_error.try_into_panic() { Ok(panic) => { tracing::error!("invocation panicked, attempting to report the failure to dashboard"); - mark_as_failed(dashboard_client, invocation_uuid, Some("Panicked".into())).await; + dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some("Panicked".into())).await; std::panic::resume_unwind(panic) } Err(_) => { @@ -453,707 +201,3 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { Ok(()) } - -async fn mark_as_failed( - dashboard_client: Client, - invocation_uuid: Uuid, - failure_reason: Option, -) { - let response = dashboard_client - .post("cancel-invocation") - .json(&json!({ - "invocation_uuid": invocation_uuid, - "failure_reason": failure_reason, - })) - .send() - .await; - let response = match response { - Ok(response) => response, - Err(response_error) => { - tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed"); - return; - } - }; - - if !response.status().is_success() { - tracing::error!( - %invocation_uuid, - "could not mark invocation as failed: {}", - response.text().await.unwrap() - ); - return; - } - tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled"); -} - -#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner -#[tracing::instrument(skip(assets_client, dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = workload.name))] -async fn run_workload( - assets_client: &Client, - dashboard_client: &Client, - logs_client: &Client, - meili_client: &Client, - invocation_uuid: Uuid, - master_key: Option<&str>, - workload: Workload, - args: &BenchDeriveArgs, -) -> anyhow::Result<()> { - fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; - - let response = dashboard_client - .put("workload") - .json(&json!({ - "invocation_uuid": invocation_uuid, - "name": &workload.name, - "max_runs": workload.run_count, - })) - .send() - .await - .context("could not create new workload")?; - - if !response.status().is_success() { - bail!("creating new workload failed: {}", response.text().await.unwrap()) - } - - let workload_uuid: Uuid = - response.json().await.context("could not deserialize JSON as UUID")?; - - let mut tasks = Vec::new(); - - for i in 0..workload.run_count { - tasks.push( - run_workload_run( 
- dashboard_client, - logs_client, - meili_client, - workload_uuid, - master_key, - &workload, - args, - i, - ) - .await?, - ); - } - - let mut reports = Vec::with_capacity(workload.run_count as usize); - - for task in tasks { - reports.push( - task.await - .context("task panicked while processing report")? - .context("task failed while processing report")?, - ); - } - - tracing::info!(workload = workload.name, "Successful workload"); - - Ok(()) -} - -#[tracing::instrument(skip(client, assets), fields(asset_count = assets.len()))] -async fn fetch_assets( - client: &Client, - assets: &BTreeMap, - asset_folder: &str, -) -> anyhow::Result<()> { - let mut download_tasks = tokio::task::JoinSet::new(); - for (name, asset) in assets { - // trying local - if let Some(local) = &asset.local_location { - match std::fs::File::open(local) { - Ok(file) => { - if check_sha256(name, asset, file)? { - continue; - } else { - tracing::warn!(asset = name, file = local, "found local resource for asset but hash differed, skipping to asset store"); - } - } - Err(error) => match error.kind() { - std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ - } - _ => tracing::warn!( - error = &error as &dyn std::error::Error, - "error checking local resource, skipping to asset store" - ), - }, - } - } - - // checking asset store - let store_filename = format!("{}/{}", asset_folder, name); - - match std::fs::File::open(&store_filename) { - Ok(file) => { - if check_sha256(name, asset, file)? { - continue; - } else { - tracing::warn!(asset = name, file = store_filename, "found resource for asset in asset store, but hash differed, skipping to remote method"); - } - } - Err(error) => match error.kind() { - std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */ - } - _ => tracing::warn!( - error = &error as &dyn std::error::Error, - "error checking resource in store, skipping to remote method" - ), - }, - } - - // downloading remote - match &asset.remote_location { - Some(location) => { - std::fs::create_dir_all(asset_folder).with_context(|| format!("could not create asset folder at {asset_folder}"))?; - download_tasks.spawn({ - let client = client.clone(); - let name = name.to_string(); - let location = location.to_string(); - let store_filename = store_filename.clone(); - let asset = asset.clone(); - download_asset(client, name, asset, location, store_filename)}); - }, - None => bail!("asset {name} has no remote location, but was not found locally or in the asset store"), - } - } - - while let Some(res) = download_tasks.join_next().await { - res.context("download task panicked")?.context("download task failed")?; - } - - Ok(()) -} - -fn check_sha256(name: &str, asset: &Asset, mut file: std::fs::File) -> anyhow::Result { - let mut bytes = Vec::new(); - file.read_to_end(&mut bytes).with_context(|| format!("hashing file for asset {name}"))?; - let mut file_hash = sha2::Sha256::new(); - file_hash.update(&bytes); - let file_hash = file_hash.finalize(); - let file_hash = format!("{:x}", file_hash); - tracing::debug!(hash = file_hash, "hashed local file"); - - Ok(match &asset.sha256 { - Some(hash) => { - tracing::debug!(hash, "hash from workload"); - if hash.to_ascii_lowercase() == file_hash { - true - } else { - tracing::warn!( - file_hash, - asset_hash = hash.to_ascii_lowercase(), - "hashes don't match" - ); - false - } - } - None => { - tracing::warn!(sha256 = file_hash, "Skipping hash for asset {name} that doesn't have one. 
Please add it to workload file"); - true - } - }) -} - -#[tracing::instrument(skip(client, asset, name), fields(asset = name))] -async fn download_asset( - client: Client, - name: String, - asset: Asset, - src: String, - dest_filename: String, -) -> anyhow::Result<()> { - let context = || format!("failure downloading asset {name} from {src}"); - - let response = client.get(&src).send().await.with_context(context)?; - - let file = std::fs::File::options() - .create(true) - .truncate(true) - .write(true) - .read(true) - .open(&dest_filename) - .with_context(|| format!("creating destination file {dest_filename}")) - .with_context(context)?; - - let mut dest = std::io::BufWriter::new( - file.try_clone().context("cloning I/O handle").with_context(context)?, - ); - - let total_len: Option = response - .headers() - .get(reqwest::header::CONTENT_LENGTH) - .and_then(|value| value.to_str().ok()) - .and_then(|value| value.parse().ok()); - - let progress = tokio::spawn({ - let name = name.clone(); - async move { - loop { - match file.metadata().context("could not get file metadata") { - Ok(metadata) => { - let len = metadata.len(); - tracing::info!( - asset = name, - downloaded_bytes = len, - total_bytes = total_len, - "asset download in progress" - ); - } - Err(error) => { - tracing::warn!(%error, "could not get file metadata"); - } - } - tokio::time::sleep(std::time::Duration::from_secs(60)).await; - } - } - }); - - let writing_context = || format!("while writing to destination file at {dest_filename}"); - - let mut response = response.bytes_stream(); - - while let Some(bytes) = - response.try_next().await.context("while downloading file").with_context(context)? - { - dest.write_all(&bytes).with_context(writing_context).with_context(context)?; - } - - progress.abort(); - - let mut file = dest.into_inner().with_context(writing_context).with_context(context)?; - - file.rewind().context("while rewinding asset file")?; - - if !check_sha256(&name, &asset, file)? 
{ - bail!("asset '{name}': sha256 mismatch for file {dest_filename} downloaded from {src}") - } - - Ok(()) -} - -#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner -#[tracing::instrument(skip(dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = %workload.name))] -async fn run_workload_run( - dashboard_client: &Client, - logs_client: &Client, - meili_client: &Client, - workload_uuid: Uuid, - master_key: Option<&str>, - workload: &Workload, - args: &BenchDeriveArgs, - run_number: u16, -) -> anyhow::Result>> { - delete_db(); - build_meilisearch().await?; - let meilisearch = - start_meilisearch(meili_client, master_key, workload, &args.asset_folder).await?; - let processor = run_commands( - dashboard_client, - logs_client, - meili_client, - workload_uuid, - workload, - args, - run_number, - ) - .await?; - - kill_meilisearch(meilisearch).await; - - tracing::info!(run_number, "Successful run"); - - Ok(processor) -} - -async fn kill_meilisearch(mut meilisearch: tokio::process::Child) { - if let Err(error) = meilisearch.kill().await { - tracing::warn!( - error = &error as &dyn std::error::Error, - "while terminating Meilisearch server" - ) - } -} - -#[tracing::instrument] -async fn build_meilisearch() -> anyhow::Result<()> { - let mut command = tokio::process::Command::new("cargo"); - command.arg("build").arg("--release").arg("-p").arg("meilisearch"); - - command.kill_on_drop(true); - - let mut builder = command.spawn().context("error building Meilisearch")?; - - if !builder.wait().await.context("could not build Meilisearch")?.success() { - bail!("failed building Meilisearch") - } - - Ok(()) -} - -#[tracing::instrument(skip(client, master_key, workload), fields(workload = workload.name))] -async fn start_meilisearch( - client: &Client, - master_key: Option<&str>, - workload: &Workload, - asset_folder: &str, -) -> anyhow::Result { - let mut command = tokio::process::Command::new("cargo"); - command - .arg("run") - .arg("--release") - .arg("-p") - .arg("meilisearch") - .arg("--bin") - .arg("meilisearch") - .arg("--"); - - command.arg("--db-path").arg("./_xtask_benchmark.ms"); - if let Some(master_key) = master_key { - command.arg("--master-key").arg(master_key); - } - command.arg("--experimental-enable-logs-route"); - - for extra_arg in workload.extra_cli_args.iter() { - command.arg(extra_arg); - } - - command.kill_on_drop(true); - - let mut meilisearch = command.spawn().context("Error starting Meilisearch")?; - - wait_for_health(client, &mut meilisearch, &workload.assets, asset_folder).await?; - - Ok(meilisearch) -} - -async fn wait_for_health( - client: &Client, - meilisearch: &mut tokio::process::Child, - assets: &BTreeMap, - asset_folder: &str, -) -> anyhow::Result<()> { - for i in 0..100 { - let res = run_command(client.clone(), health_command(), assets, asset_folder).await; - if res.is_ok() { - // check that this is actually the current Meilisearch instance that answered us - if let Some(exit_code) = - meilisearch.try_wait().context("cannot check Meilisearch server process status")? - { - tracing::error!("Got an health response from a different process"); - bail!("Meilisearch server exited early with code {exit_code}"); - } - - return Ok(()); - } - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - // check whether the Meilisearch instance exited early (cut the wait) - if let Some(exit_code) = - meilisearch.try_wait().context("cannot check Meilisearch server process status")? 
- { - bail!("Meilisearch server exited early with code {exit_code}"); - } - tracing::debug!(attempt = i, "Waiting for Meilisearch to go up"); - } - bail!("meilisearch is not responding") -} - -fn health_command() -> Command { - Command { - route: "/health".into(), - method: Method::GET, - body: Default::default(), - synchronous: SyncMode::WaitForResponse, - } -} - -fn delete_db() { - let _ = std::fs::remove_dir_all("./_xtask_benchmark.ms"); -} - -async fn run_commands( - dashboard_client: &Client, - logs_client: &Client, - meili_client: &Client, - workload_uuid: Uuid, - workload: &Workload, - args: &BenchDeriveArgs, - run_number: u16, -) -> anyhow::Result>> { - let report_folder = &args.report_folder; - let workload_name = &workload.name; - - std::fs::create_dir_all(report_folder) - .with_context(|| format!("could not create report directory at {report_folder}"))?; - - let trace_filename = format!("{report_folder}/{workload_name}-{run_number}-trace.json"); - let report_filename = format!("{report_folder}/{workload_name}-{run_number}-report.json"); - - let report_handle = start_report(logs_client, trace_filename).await?; - - for batch in workload - .commands - .as_slice() - .split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait)) - { - run_batch(meili_client, batch, &workload.assets, &args.asset_folder).await?; - } - - let processor = - stop_report(dashboard_client, logs_client, workload_uuid, report_filename, report_handle) - .await?; - - Ok(processor) -} - -async fn stop_report( - dashboard_client: &Client, - logs_client: &Client, - workload_uuid: Uuid, - filename: String, - report_handle: tokio::task::JoinHandle>, -) -> anyhow::Result>> { - let response = logs_client.delete("").send().await.context("while stopping report")?; - if !response.status().is_success() { - bail!("received HTTP {} while stopping report", response.status()) - } - - let mut file = tokio::time::timeout(std::time::Duration::from_secs(1000), report_handle) - .await - .context("while waiting for the end of the report")? - .context("report writing task panicked")? 
- .context("while writing report")?; - - file.rewind().context("while rewinding report file")?; - - let process_handle = tokio::task::spawn({ - let dashboard_client = dashboard_client.clone(); - async move { - let span = tracing::info_span!("processing trace to report", filename); - let _guard = span.enter(); - let report = tracing_trace::processor::span_stats::to_call_stats( - tracing_trace::TraceReader::new(std::io::BufReader::new(file)), - ) - .context("could not convert trace to report")?; - let context = || format!("writing report to {filename}"); - - let response = dashboard_client - .put("run") - .json(&json!({ - "workload_uuid": workload_uuid, - "data": report - })) - .send() - .await - .context("sending new run")?; - - if !response.status().is_success() { - bail!( - "sending new run failed: {}", - response.text().await.unwrap_or_else(|_| "unknown".into()) - ) - } - - let mut output_file = std::io::BufWriter::new( - std::fs::File::options() - .create(true) - .truncate(true) - .write(true) - .read(true) - .open(&filename) - .with_context(context)?, - ); - - for (key, value) in report { - serde_json::to_writer(&mut output_file, &json!({key: value})) - .context("serializing span stat")?; - writeln!(&mut output_file).with_context(context)?; - } - output_file.flush().with_context(context)?; - let mut output_file = output_file.into_inner().with_context(context)?; - - output_file.rewind().context("could not rewind output_file").with_context(context)?; - - tracing::info!("success"); - Ok(output_file) - } - }); - - Ok(process_handle) -} - -async fn start_report( - logs_client: &Client, - filename: String, -) -> anyhow::Result>> { - let report_file = std::fs::File::options() - .create(true) - .truncate(true) - .write(true) - .read(true) - .open(&filename) - .with_context(|| format!("could not create file at {filename}"))?; - let mut report_file = std::io::BufWriter::new(report_file); - - let response = logs_client - .post("") - .json(&json!({ - "mode": "profile", - "target": "indexing::=trace" - })) - .send() - .await - .context("failed to start report")?; - - let code = response.status(); - if code.is_client_error() { - tracing::error!(%code, "request error when trying to start report"); - let response: serde_json::Value = response - .json() - .await - .context("could not deserialize response as JSON") - .context("response error when trying to start report")?; - bail!( - "request error when trying to start report: server responded with error code {code} and '{response}'" - ) - } else if code.is_server_error() { - tracing::error!(%code, "server error when trying to start report"); - let response: serde_json::Value = response - .json() - .await - .context("could not deserialize response as JSON") - .context("response error trying to start report")?; - bail!("server error when trying to start report: server responded with error code {code} and '{response}'") - } - - Ok(tokio::task::spawn(async move { - let mut stream = response.bytes_stream(); - while let Some(bytes) = stream.try_next().await.context("while waiting for report")? 
{ - report_file - .write_all(&bytes) - .with_context(|| format!("while writing report to {filename}"))?; - } - report_file.into_inner().with_context(|| format!("while writing report to {filename}")) - })) -} - -async fn run_batch( - client: &Client, - batch: &[Command], - assets: &BTreeMap, - asset_folder: &str, -) -> anyhow::Result<()> { - let [.., last] = batch else { return Ok(()) }; - let sync = last.synchronous; - - let mut tasks = tokio::task::JoinSet::new(); - - for command in batch { - // FIXME: you probably don't want to copy assets everytime here - tasks.spawn({ - let client = client.clone(); - let command = command.clone(); - let assets = assets.clone(); - let asset_folder = asset_folder.to_owned(); - - async move { run_command(client, command, &assets, &asset_folder).await } - }); - } - - while let Some(result) = tasks.join_next().await { - result - .context("panicked while executing command")? - .context("error while executing command")?; - } - - match sync { - SyncMode::DontWait => {} - SyncMode::WaitForResponse => {} - SyncMode::WaitForTask => wait_for_tasks(client).await?, - } - - Ok(()) -} - -async fn wait_for_tasks(client: &Client) -> anyhow::Result<()> { - loop { - let response = client - .get("tasks?statuses=enqueued,processing") - .send() - .await - .context("could not wait for tasks")?; - let response: serde_json::Value = response - .json() - .await - .context("could not deserialize response to JSON") - .context("could not wait for tasks")?; - match response.get("total") { - Some(serde_json::Value::Number(number)) => { - let number = number.as_u64().with_context(|| { - format!("waiting for tasks: could not parse 'total' as integer, got {}", number) - })?; - if number == 0 { - break; - } else { - tokio::time::sleep(std::time::Duration::from_secs(1)).await; - continue; - } - } - Some(thing_else) => { - bail!(format!( - "waiting for tasks: could not parse 'total' as a number, got '{thing_else}'" - )) - } - None => { - bail!(format!( - "waiting for tasks: expected response to contain 'total', got '{response}'" - )) - } - } - } - Ok(()) -} - -#[tracing::instrument(skip(client, command, assets, asset_folder), fields(command = %command))] -async fn run_command( - client: Client, - mut command: Command, - assets: &BTreeMap, - asset_folder: &str, -) -> anyhow::Result<()> { - // memtake the body here to leave an empty body in its place, so that command is not partially moved-out - let body = std::mem::take(&mut command.body) - .get(assets, asset_folder) - .with_context(|| format!("while getting body for command {command}"))?; - - let request = client.request(command.method.into(), &command.route); - - let request = if let Some((body, content_type)) = body { - request.body(body).header(reqwest::header::CONTENT_TYPE, content_type) - } else { - request - }; - - let response = - request.send().await.with_context(|| format!("error sending command: {}", command))?; - - let code = response.status(); - if code.is_client_error() { - tracing::error!(%command, %code, "error in workload file"); - let response: serde_json::Value = response - .json() - .await - .context("could not deserialize response as JSON") - .context("parsing error in workload file when sending command")?; - bail!("error in workload file: server responded with error code {code} and '{response}'") - } else if code.is_server_error() { - tracing::error!(%command, %code, "server error"); - let response: serde_json::Value = response - .json() - .await - .context("could not deserialize response as JSON") - .context("parsing 
server error when sending command")?; - bail!("server error: server responded with error code {code} and '{response}'") - } - - Ok(()) -} diff --git a/xtask/src/bench/workload.rs b/xtask/src/bench/workload.rs new file mode 100644 index 000000000..b3e952f29 --- /dev/null +++ b/xtask/src/bench/workload.rs @@ -0,0 +1,262 @@ +use std::collections::BTreeMap; +use std::fs::File; +use std::io::{Seek as _, Write as _}; + +use anyhow::{bail, Context as _}; +use futures_util::TryStreamExt as _; +use serde::Deserialize; +use serde_json::json; +use tokio::task::JoinHandle; +use uuid::Uuid; + +use super::assets::Asset; +use super::client::Client; +use super::command::SyncMode; +use super::BenchDeriveArgs; +use crate::bench::{assets, dashboard, meili_process}; + +#[derive(Deserialize)] +pub struct Workload { + pub name: String, + pub run_count: u16, + pub extra_cli_args: Vec, + pub assets: BTreeMap, + pub commands: Vec, +} + +async fn run_commands( + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + workload_uuid: Uuid, + workload: &Workload, + args: &BenchDeriveArgs, + run_number: u16, +) -> anyhow::Result>> { + let report_folder = &args.report_folder; + let workload_name = &workload.name; + + std::fs::create_dir_all(report_folder) + .with_context(|| format!("could not create report directory at {report_folder}"))?; + + let trace_filename = format!("{report_folder}/{workload_name}-{run_number}-trace.json"); + let report_filename = format!("{report_folder}/{workload_name}-{run_number}-report.json"); + + let report_handle = start_report(logs_client, trace_filename).await?; + + for batch in workload + .commands + .as_slice() + .split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait)) + { + super::command::run_batch(meili_client, batch, &workload.assets, &args.asset_folder) + .await?; + } + + let processor = + stop_report(dashboard_client, logs_client, workload_uuid, report_filename, report_handle) + .await?; + + Ok(processor) +} + +#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner +#[tracing::instrument(skip(assets_client, dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = workload.name))] +pub async fn execute( + assets_client: &Client, + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + invocation_uuid: Uuid, + master_key: Option<&str>, + workload: Workload, + args: &BenchDeriveArgs, +) -> anyhow::Result<()> { + assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; + + let workload_uuid = + dashboard::create_workload(dashboard_client, invocation_uuid, &workload).await?; + + let mut tasks = Vec::new(); + + for i in 0..workload.run_count { + tasks.push( + execute_run( + dashboard_client, + logs_client, + meili_client, + workload_uuid, + master_key, + &workload, + args, + i, + ) + .await?, + ); + } + + let mut reports = Vec::with_capacity(workload.run_count as usize); + + for task in tasks { + reports.push( + task.await + .context("task panicked while processing report")? 
+ .context("task failed while processing report")?, + ); + } + + tracing::info!(workload = workload.name, "Successful workload"); + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner +#[tracing::instrument(skip(dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = %workload.name))] +async fn execute_run( + dashboard_client: &Client, + logs_client: &Client, + meili_client: &Client, + workload_uuid: Uuid, + master_key: Option<&str>, + workload: &Workload, + args: &BenchDeriveArgs, + run_number: u16, +) -> anyhow::Result>> { + meili_process::delete_db(); + + meili_process::build().await?; + let meilisearch = + meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?; + + let processor = run_commands( + dashboard_client, + logs_client, + meili_client, + workload_uuid, + workload, + args, + run_number, + ) + .await?; + + meili_process::kill(meilisearch).await; + + tracing::info!(run_number, "Successful run"); + + Ok(processor) +} + +async fn start_report( + logs_client: &Client, + filename: String, +) -> anyhow::Result>> { + let report_file = std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&filename) + .with_context(|| format!("could not create file at {filename}"))?; + let mut report_file = std::io::BufWriter::new(report_file); + + let response = logs_client + .post("") + .json(&json!({ + "mode": "profile", + "target": "indexing::=trace" + })) + .send() + .await + .context("failed to start report")?; + + let code = response.status(); + if code.is_client_error() { + tracing::error!(%code, "request error when trying to start report"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("response error when trying to start report")?; + bail!( + "request error when trying to start report: server responded with error code {code} and '{response}'" + ) + } else if code.is_server_error() { + tracing::error!(%code, "server error when trying to start report"); + let response: serde_json::Value = response + .json() + .await + .context("could not deserialize response as JSON") + .context("response error trying to start report")?; + bail!("server error when trying to start report: server responded with error code {code} and '{response}'") + } + + Ok(tokio::task::spawn(async move { + let mut stream = response.bytes_stream(); + while let Some(bytes) = stream.try_next().await.context("while waiting for report")? { + report_file + .write_all(&bytes) + .with_context(|| format!("while writing report to {filename}"))?; + } + report_file.into_inner().with_context(|| format!("while writing report to {filename}")) + })) +} + +async fn stop_report( + dashboard_client: &Client, + logs_client: &Client, + workload_uuid: Uuid, + filename: String, + report_handle: tokio::task::JoinHandle>, +) -> anyhow::Result>> { + let response = logs_client.delete("").send().await.context("while stopping report")?; + if !response.status().is_success() { + bail!("received HTTP {} while stopping report", response.status()) + } + + let mut file = tokio::time::timeout(std::time::Duration::from_secs(1000), report_handle) + .await + .context("while waiting for the end of the report")? + .context("report writing task panicked")? 
+ .context("while writing report")?; + + file.rewind().context("while rewinding report file")?; + + let process_handle = tokio::task::spawn({ + let dashboard_client = dashboard_client.clone(); + async move { + let span = tracing::info_span!("processing trace to report", filename); + let _guard = span.enter(); + let report = tracing_trace::processor::span_stats::to_call_stats( + tracing_trace::TraceReader::new(std::io::BufReader::new(file)), + ) + .context("could not convert trace to report")?; + let context = || format!("writing report to {filename}"); + + dashboard::create_run(dashboard_client, workload_uuid, &report).await?; + + let mut output_file = std::io::BufWriter::new( + std::fs::File::options() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&filename) + .with_context(context)?, + ); + + for (key, value) in report { + serde_json::to_writer(&mut output_file, &json!({key: value})) + .context("serializing span stat")?; + writeln!(&mut output_file).with_context(context)?; + } + output_file.flush().with_context(context)?; + let mut output_file = output_file.into_inner().with_context(context)?; + + output_file.rewind().context("could not rewind output_file").with_context(context)?; + + Ok(output_file) + } + }); + + Ok(process_handle) +} From 15c38dca78a9dd1377715a4aecca4f762aebdc02 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 14:44:48 +0100 Subject: [PATCH 50/52] Output RFC 3339 dates where we can Co-authored-by: Tamo --- meilisearch/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index 79ca7ec80..35658fc92 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -195,7 +195,7 @@ pub fn print_launch_resume( build_info .commit_timestamp .and_then(|commit_timestamp| commit_timestamp - .format(&time::format_description::well_known::Iso8601::DEFAULT) + .format(&time::format_description::well_known::Rfc3339::DEFAULT) .ok()) .unwrap_or("unknown".into()) ); From 663629a9d60fbace1e498fe72f7c40bbfcdac998 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 14:45:06 +0100 Subject: [PATCH 51/52] Remove unused build dependency from xtask Co-authored-by: Tamo --- xtask/Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 0df8161ce..562dfddb3 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -42,6 +42,3 @@ tracing = "0.1.40" tracing-subscriber = "0.3.18" tracing-trace = { version = "0.1.0", path = "../tracing-trace" } uuid = { version = "1.7.0", features = ["v7", "serde"] } - -[build-dependencies] -anyhow = "1.0.79" From 7408db2a46a3620d21b5c7f414c71431d1343ff9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 5 Mar 2024 14:56:48 +0100 Subject: [PATCH 52/52] Meilisearch: fix date formatting --- meilisearch/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index 35658fc92..3451325b2 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -195,7 +195,7 @@ pub fn print_launch_resume( build_info .commit_timestamp .and_then(|commit_timestamp| commit_timestamp - .format(&time::format_description::well_known::Rfc3339::DEFAULT) + .format(&time::format_description::well_known::Rfc3339) .ok()) .unwrap_or("unknown".into()) );
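
For reference on the `time` API that the last two patches touch: `Iso8601` is generic over an encoded configuration, so a concrete formatter is named through an associated constant (`Iso8601::DEFAULT`), while `Rfc3339` is a plain unit struct with no `DEFAULT`, which is why patch 52/52 has to drop the suffix introduced in patch 50/52. The sketch below is illustrative only and is not part of the patch series; it assumes the `time` crate with its default `std` feature plus `formatting` enabled.

```rust
use time::format_description::well_known::{Iso8601, Rfc3339};
use time::OffsetDateTime;

fn main() -> Result<(), time::error::Format> {
    let now = OffsetDateTime::now_utc();

    // `Iso8601` is generic over an encoded configuration, so a concrete
    // formatter is selected through an associated constant.
    println!("ISO 8601: {}", now.format(&Iso8601::DEFAULT)?);

    // `Rfc3339` is a plain unit struct and is passed directly; there is no
    // `Rfc3339::DEFAULT`, which is what the final patch corrects.
    println!("RFC 3339: {}", now.format(&Rfc3339)?);

    Ok(())
}
```

Both calls return `Result<String, time::error::Format>`, matching the `.ok()` / `.unwrap_or("unknown".into())` fallback used in `print_launch_resume` above.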