mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Merge #4971
4971: update arroy r=dureuill a=irevoire # Pull Request Fix part of https://github.com/meilisearch/meilisearch/issues/3715 ## What does this PR do? - Update arroy to the latest version, most change are maintenance changes - The performances of adding vectors to arroy should slightly improve - Forward the build cancellation function to arroy so it can stop building trees when we have to stop an indexing process Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
cc669f90d5
5
Cargo.lock
generated
5
Cargo.lock
generated
@ -386,8 +386,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arroy"
|
name = "arroy"
|
||||||
version = "0.4.0"
|
version = "0.5.0"
|
||||||
source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
|
@ -40,7 +40,7 @@ ureq = "2.10.0"
|
|||||||
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
arroy = "0.5.0"
|
||||||
big_s = "1.0.2"
|
big_s = "1.0.2"
|
||||||
crossbeam = "0.8.4"
|
crossbeam = "0.8.4"
|
||||||
insta = { version = "1.39.0", features = ["json", "redactions"] }
|
insta = { version = "1.39.0", features = ["json", "redactions"] }
|
||||||
|
@ -80,7 +80,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
|
|||||||
tiktoken-rs = "0.5.9"
|
tiktoken-rs = "0.5.9"
|
||||||
liquid = "0.26.6"
|
liquid = "0.26.6"
|
||||||
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
||||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
arroy = "0.5.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
tracing = "0.1.40"
|
tracing = "0.1.40"
|
||||||
ureq = { version = "2.10.0", features = ["json"] }
|
ureq = { version = "2.10.0", features = ["json"] }
|
||||||
|
@ -297,6 +297,7 @@ impl From<arroy::Error> for Error {
|
|||||||
arroy::Error::InvalidVecDimension { expected, received } => {
|
arroy::Error::InvalidVecDimension { expected, received } => {
|
||||||
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
||||||
}
|
}
|
||||||
|
arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
|
||||||
arroy::Error::DatabaseFull
|
arroy::Error::DatabaseFull
|
||||||
| arroy::Error::InvalidItemAppend
|
| arroy::Error::InvalidItemAppend
|
||||||
| arroy::Error::UnmatchingDistance { .. }
|
| arroy::Error::UnmatchingDistance { .. }
|
||||||
|
@ -699,6 +699,7 @@ where
|
|||||||
for (embedder_name, dimension) in dimension {
|
for (embedder_name, dimension) in dimension {
|
||||||
let wtxn = &mut *self.wtxn;
|
let wtxn = &mut *self.wtxn;
|
||||||
let vector_arroy = self.index.vector_arroy;
|
let vector_arroy = self.index.vector_arroy;
|
||||||
|
let cancel = &self.should_abort;
|
||||||
|
|
||||||
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||||
@ -713,7 +714,7 @@ where
|
|||||||
|
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
|
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
|
||||||
writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing)?;
|
writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?;
|
||||||
Result::Ok(())
|
Result::Ok(())
|
||||||
})
|
})
|
||||||
.map_err(InternalError::from)??;
|
.map_err(InternalError::from)??;
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use arroy::distances::{Angular, BinaryQuantizedAngular};
|
use arroy::distances::{BinaryQuantizedCosine, Cosine};
|
||||||
use arroy::ItemId;
|
use arroy::ItemId;
|
||||||
use deserr::{DeserializeError, Deserr};
|
use deserr::{DeserializeError, Deserr};
|
||||||
use heed::{RoTxn, RwTxn, Unspecified};
|
use heed::{RoTxn, RwTxn, Unspecified};
|
||||||
@ -82,12 +82,13 @@ impl ArroyWrapper {
|
|||||||
rng: &mut R,
|
rng: &mut R,
|
||||||
dimension: usize,
|
dimension: usize,
|
||||||
quantizing: bool,
|
quantizing: bool,
|
||||||
|
cancel: &(impl Fn() -> bool + Sync + Send),
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), arroy::Error> {
|
||||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
if writer.need_build(wtxn)? {
|
if writer.need_build(wtxn)? {
|
||||||
writer.build(wtxn, rng, None)?
|
writer.builder(rng).build(wtxn)?
|
||||||
} else if writer.is_empty(wtxn)? {
|
} else if writer.is_empty(wtxn)? {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -99,11 +100,10 @@ impl ArroyWrapper {
|
|||||||
// only happens once in the life of an embedder, it's not very performances
|
// only happens once in the life of an embedder, it's not very performances
|
||||||
// sensitive.
|
// sensitive.
|
||||||
if quantizing && !self.quantized {
|
if quantizing && !self.quantized {
|
||||||
let writer =
|
let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
|
||||||
writer.prepare_changing_distance::<BinaryQuantizedAngular>(wtxn)?;
|
writer.builder(rng).cancel(cancel).build(wtxn)?;
|
||||||
writer.build(wtxn, rng, None)?
|
|
||||||
} else if writer.need_build(wtxn)? {
|
} else if writer.need_build(wtxn)? {
|
||||||
writer.build(wtxn, rng, None)?
|
writer.builder(rng).cancel(cancel).build(wtxn)?;
|
||||||
} else if writer.is_empty(wtxn)? {
|
} else if writer.is_empty(wtxn)? {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -323,8 +323,13 @@ impl ArroyWrapper {
|
|||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
|
|
||||||
for reader in self.readers(rtxn, db) {
|
for reader in self.readers(rtxn, db) {
|
||||||
let ret = reader?.nns_by_item(rtxn, item, limit, None, None, filter)?;
|
let reader = reader?;
|
||||||
if let Some(mut ret) = ret {
|
let mut searcher = reader.nns(limit);
|
||||||
|
if let Some(filter) = filter {
|
||||||
|
searcher.candidates(filter);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(mut ret) = searcher.by_item(rtxn, item)? {
|
||||||
results.append(&mut ret);
|
results.append(&mut ret);
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
@ -359,8 +364,13 @@ impl ArroyWrapper {
|
|||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
|
|
||||||
for reader in self.readers(rtxn, db) {
|
for reader in self.readers(rtxn, db) {
|
||||||
let mut ret = reader?.nns_by_vector(rtxn, vector, limit, None, None, filter)?;
|
let reader = reader?;
|
||||||
results.append(&mut ret);
|
let mut searcher = reader.nns(limit);
|
||||||
|
if let Some(filter) = filter {
|
||||||
|
searcher.candidates(filter);
|
||||||
|
}
|
||||||
|
|
||||||
|
results.append(&mut searcher.by_vector(rtxn, vector)?);
|
||||||
}
|
}
|
||||||
|
|
||||||
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
||||||
@ -391,11 +401,11 @@ impl ArroyWrapper {
|
|||||||
Ok(vectors)
|
Ok(vectors)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn angular_db(&self) -> arroy::Database<Angular> {
|
fn angular_db(&self) -> arroy::Database<Cosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedAngular> {
|
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user