Add support for the progress API of arroy

This commit is contained in:
Tamo 2025-03-13 17:36:49 +01:00
parent 82912e191b
commit 009c36a4d0
7 changed files with 67 additions and 4 deletions

7
Cargo.lock generated
View File

@ -393,12 +393,13 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]] [[package]]
name = "arroy" name = "arroy"
version = "0.6.0" version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a885313dfac15b64fd61a39d1970a2befa076c69a763434117c5b6163f9fecb" checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
dependencies = [ dependencies = [
"bytemuck", "bytemuck",
"byteorder", "byteorder",
"enum-iterator",
"heed", "heed",
"memmap2", "memmap2",
"nohash", "nohash",
@ -3017,7 +3018,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"windows-targets 0.48.1", "windows-targets 0.52.6",
] ]
[[package]] [[package]]

View File

@ -87,7 +87,7 @@ rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838
"no_time", "no_time",
"sync", "sync",
] } ] }
arroy = "0.6.0" arroy = "0.6.1"
rand = "0.8.5" rand = "0.8.5"
tracing = "0.1.41" tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] } ureq = { version = "2.12.1", features = ["json"] }

View File

@ -1,3 +1,4 @@
use enum_iterator::Sequence;
use std::any::TypeId; use std::any::TypeId;
use std::borrow::Cow; use std::borrow::Cow;
use std::marker::PhantomData; use std::marker::PhantomData;
@ -76,6 +77,14 @@ impl Progress {
durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect() durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect()
} }
// TODO: ideally we should expose the progress in a way that let arroy use it directly
pub(crate) fn update_progress_from_arroy(&self, progress: arroy::WriterProgress) {
self.update_progress(progress.main);
if let Some(sub) = progress.sub {
self.update_progress(sub);
}
}
} }
/// Generate the names associated with the durations and push them. /// Generate the names associated with the durations and push them.
@ -238,3 +247,44 @@ impl<U: Send + Sync + 'static> Step for VariableNameStep<U> {
self.total self.total
} }
} }
impl Step for arroy::MainStep {
fn name(&self) -> Cow<'static, str> {
match self {
arroy::MainStep::PreProcessingTheItems => "pre processing the items",
arroy::MainStep::WritingTheDescendantsAndMetadata => {
"writing the descendants and metadata"
}
arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
arroy::MainStep::UpdatingTheTrees => "updating the trees",
arroy::MainStep::CreateNewTrees => "create new trees",
arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
arroy::MainStep::WriteTheMetadata => "write the metadata",
}
.into()
}
fn current(&self) -> u32 {
*self as u32
}
fn total(&self) -> u32 {
Self::CARDINALITY as u32
}
}
impl Step for arroy::SubStep {
fn name(&self) -> Cow<'static, str> {
self.unit.into()
}
fn current(&self) -> u32 {
self.current.load(Ordering::Relaxed)
}
fn total(&self) -> u32 {
self.max
}
}

View File

@ -31,6 +31,7 @@ use super::new::StdResult;
use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError}; use crate::error::{Error, InternalError};
use crate::index::{PrefixSearch, PrefixSettings}; use crate::index::{PrefixSearch, PrefixSettings};
use crate::progress::Progress;
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap; pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{ use crate::update::{
@ -522,6 +523,8 @@ where
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
writer.build_and_quantize( writer.build_and_quantize(
wtxn, wtxn,
// In the settings we don't have any progress to share
&Progress::default(),
&mut rng, &mut rng,
dimension, dimension,
is_quantizing, is_quantizing,

View File

@ -201,6 +201,7 @@ where
build_vectors( build_vectors(
index, index,
wtxn, wtxn,
indexing_context.progress,
index_embeddings, index_embeddings,
arroy_memory, arroy_memory,
&mut arroy_writers, &mut arroy_writers,

View File

@ -10,6 +10,7 @@ use super::super::channel::*;
use crate::documents::PrimaryKey; use crate::documents::PrimaryKey;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig; use crate::index::IndexEmbeddingConfig;
use crate::progress::Progress;
use crate::update::settings::InnerIndexSettings; use crate::update::settings::InnerIndexSettings;
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings};
use crate::{Error, Index, InternalError, Result}; use crate::{Error, Index, InternalError, Result};
@ -100,6 +101,7 @@ impl ChannelCongestion {
pub fn build_vectors<MSP>( pub fn build_vectors<MSP>(
index: &Index, index: &Index,
wtxn: &mut RwTxn<'_>, wtxn: &mut RwTxn<'_>,
progress: &Progress,
index_embeddings: Vec<IndexEmbeddingConfig>, index_embeddings: Vec<IndexEmbeddingConfig>,
arroy_memory: Option<usize>, arroy_memory: Option<usize>,
arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>, arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
@ -118,6 +120,7 @@ where
let dimensions = *dimensions; let dimensions = *dimensions;
writer.build_and_quantize( writer.build_and_quantize(
wtxn, wtxn,
progress,
&mut rng, &mut rng,
dimensions, dimensions,
false, false,

View File

@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
use utoipa::ToSchema; use utoipa::ToSchema;
use self::error::{EmbedError, NewEmbedderError}; use self::error::{EmbedError, NewEmbedderError};
use crate::progress::Progress;
use crate::prompt::{Prompt, PromptData}; use crate::prompt::{Prompt, PromptData};
use crate::ThreadPoolNoAbort; use crate::ThreadPoolNoAbort;
@ -81,9 +82,11 @@ impl ArroyWrapper {
} }
} }
#[allow(clippy::too_many_arguments)]
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>( pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
&mut self, &mut self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
progress: &Progress,
rng: &mut R, rng: &mut R,
dimension: usize, dimension: usize,
quantizing: bool, quantizing: bool,
@ -110,12 +113,14 @@ impl ArroyWrapper {
writer writer
.builder(rng) .builder(rng)
.available_memory(arroy_memory.unwrap_or(usize::MAX)) .available_memory(arroy_memory.unwrap_or(usize::MAX))
.progress(|step| progress.update_progress_from_arroy(step))
.cancel(cancel) .cancel(cancel)
.build(wtxn)?; .build(wtxn)?;
} else if writer.need_build(wtxn)? { } else if writer.need_build(wtxn)? {
writer writer
.builder(rng) .builder(rng)
.available_memory(arroy_memory.unwrap_or(usize::MAX)) .available_memory(arroy_memory.unwrap_or(usize::MAX))
.progress(|step| progress.update_progress_from_arroy(step))
.cancel(cancel) .cancel(cancel)
.build(wtxn)?; .build(wtxn)?;
} else if writer.is_empty(wtxn)? { } else if writer.is_empty(wtxn)? {