WIP arroy integration
parent 13c2c6c16b
commit dde3a04679
10 changed files with 280 additions and 326 deletions
@@ -22,7 +22,6 @@ use crate::heed_codec::{
    BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
};
use crate::proximity::ProximityPrecision;
use crate::readable_slices::ReadableSlices;
use crate::vector::EmbeddingConfig;
use crate::{
    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,

@@ -49,10 +48,6 @@ pub mod main_key {
    pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
    pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
    pub const GEO_RTREE_KEY: &str = "geo-rtree";
    /// The prefix of the key that is used to store the, potential big, HNSW structure.
    /// It is concatenated with a big-endian encoded number (non-human readable).
    /// e.g. vector-hnsw0x0032.
    pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
    pub const PRIMARY_KEY_KEY: &str = "primary-key";
    pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
    pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";

@@ -75,6 +70,7 @@ pub mod main_key {
    pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
    pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
    pub const PROXIMITY_PRECISION: &str = "proximity-precision";
    pub const VECTOR_UNAVAILABLE_VECTOR_IDS: &str = "vector-unavailable-vector-ids";
    pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
}

@@ -102,6 +98,9 @@ pub mod db_name {
    pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
    pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
    pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
    pub const VECTOR_DOCID_IDS: &str = "vector-docid-ids";
    pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
    pub const VECTOR_ARROY: &str = "vector-arroy";
    pub const DOCUMENTS: &str = "documents";
    pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
}

@@ -168,8 +167,16 @@ pub struct Index {
    /// Maps the document id, the facet field id and the strings.
    pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,

    /// Maps a vector id to the document id that have it.
    /// Maps a vector id to its document id.
    pub vector_id_docid: Database<BEU32, BEU32>,
    /// Maps a doc id to its vector ids.
    pub docid_vector_ids: Database<BEU32, CboRoaringBitmapCodec>,

    /// Maps an embedder name to its id in the arroy store.
    pub embedder_category_id: Database<Str, BEU16>,

    /// Vector store based on arroy™.
    pub vector_arroy: arroy::Database<arroy::distances::DotProduct>,

    /// Maps the document id to the document as an obkv store.
    pub(crate) documents: Database<BEU32, ObkvCodec>,

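The new fields split the vector plumbing into three parts: `vector_id_docid` and `docid_vector_ids` keep the vector-to-document bookkeeping, `embedder_category_id` maps an embedder name to the `u16` index that namespaces it inside the arroy store, and `vector_arroy` is the arroy database holding the embeddings themselves. Below is a minimal in-crate sketch of how the bookkeeping side could be updated during indexing; the helper name `register_vector` and the id-allocation policy are hypothetical, and the call that actually writes the embedding into `vector_arroy` is left out because arroy's write API is not shown in this hunk.

```rust
use heed::RwTxn;
use roaring::RoaringBitmap;

use crate::{Index, Result};

/// Hypothetical helper: records that `docid` owns `vector_id` for the given
/// embedder. It only maintains the bookkeeping databases; storing the actual
/// embedding in `index.vector_arroy` is out of scope for this sketch.
fn register_vector(
    index: &Index,
    wtxn: &mut RwTxn,
    embedder: &str,
    docid: u32,
    vector_id: u32,
) -> Result<()> {
    // Look up the u16 namespace used for this embedder, registering an
    // arbitrary id of 0 when it is unknown (the real allocation policy is
    // not part of this diff).
    let _category_id = match index.embedder_category_id.get(wtxn, embedder)? {
        Some(id) => id,
        None => {
            index.embedder_category_id.put(wtxn, embedder, &0)?;
            0
        }
    };

    // vector id -> document id (one-to-one).
    index.vector_id_docid.put(wtxn, &vector_id, &docid)?;

    // document id -> all of its vector ids, stored as a roaring bitmap.
    let mut vector_ids = index.docid_vector_ids.get(wtxn, &docid)?.unwrap_or_default();
    vector_ids.insert(vector_id);
    index.docid_vector_ids.put(wtxn, &docid, &vector_ids)?;

    Ok(())
}
```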
@@ -184,7 +191,7 @@ impl Index {
    ) -> Result<Index> {
        use db_name::*;

        options.max_dbs(24);
        options.max_dbs(27);

        let env = options.open(path)?;
        let mut wtxn = env.write_txn()?;

@@ -224,7 +231,13 @@ impl Index {
            env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
        let field_id_docid_facet_strings =
            env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
        // vector stuff
        let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
        let docid_vector_ids = env.create_database(&mut wtxn, Some(VECTOR_DOCID_IDS))?;
        let embedder_category_id =
            env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
        let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;

        let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
        wtxn.commit()?;

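For context on the `max_dbs` change above: every named database opened with `create_database` consumes one slot of that limit, and this commit adds three (`VECTOR_DOCID_IDS`, `VECTOR_EMBEDDER_CATEGORY_ID`, `VECTOR_ARROY`), hence 24 becoming 27. A standalone heed sketch of that accounting, with placeholder key/value codecs instead of milli's real ones; the path and sizes are arbitrary, and depending on the exact heed release `open` may additionally need an `unsafe` block.

```rust
use heed::byteorder::BE;
use heed::types::{Str, U16, U32};
use heed::EnvOpenOptions;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let path = "vector-sketch.mdb";
    std::fs::create_dir_all(path)?;

    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MiB is plenty for the sketch
    options.max_dbs(3); // three named databases are created below
    let env = options.open(path)?;

    let mut wtxn = env.write_txn()?;
    // Each call registers one named database and uses up one `max_dbs` slot.
    let _docid_vector_ids =
        env.create_database::<U32<BE>, Str>(&mut wtxn, Some("vector-docid-ids"))?;
    let _embedder_category_id =
        env.create_database::<Str, U16<BE>>(&mut wtxn, Some("vector-embedder-category-id"))?;
    let _vector_arroy =
        env.create_database::<Str, Str>(&mut wtxn, Some("vector-arroy"))?;
    wtxn.commit()?;

    Ok(())
}
```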
@@ -255,6 +268,9 @@ impl Index {
            field_id_docid_facet_f64s,
            field_id_docid_facet_strings,
            vector_id_docid,
            vector_arroy,
            docid_vector_ids,
            embedder_category_id,
            documents,
        })
    }

@@ -477,63 +493,6 @@ impl Index {
            None => Ok(RoaringBitmap::new()),
        }
    }

    /* vector HNSW */

    /// Writes the provided `hnsw`.
    pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
        // We must delete all the chunks before we write the new HNSW chunks.
        self.delete_vector_hnsw(wtxn)?;

        let chunk_size = 1024 * 1024 * (1024 + 512); // 1.5 GiB
        let bytes = bincode::serialize(hnsw).map_err(Into::into).map_err(heed::Error::Encoding)?;
        for (i, chunk) in bytes.chunks(chunk_size).enumerate() {
            let i = i as u32;
            let mut key = main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes().to_vec();
            key.extend_from_slice(&i.to_be_bytes());
            self.main.remap_types::<Bytes, Bytes>().put(wtxn, &key, chunk)?;
        }
        Ok(())
    }

    /// Delete the `hnsw`.
    pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
        let mut iter = self
            .main
            .remap_types::<Bytes, DecodeIgnore>()
            .prefix_iter_mut(wtxn, main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes())?;
        let mut deleted = false;
        while iter.next().transpose()?.is_some() {
            // We do not keep a reference to the key or the value.
            unsafe { deleted |= iter.del_current()? };
        }
        Ok(deleted)
    }

    /// Returns the `hnsw`.
    pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
        let mut slices = Vec::new();
        for result in self
            .main
            .remap_types::<Str, Bytes>()
            .prefix_iter(rtxn, main_key::VECTOR_HNSW_KEY_PREFIX)?
        {
            let (_, slice) = result?;
            slices.push(slice);
        }

        if slices.is_empty() {
            Ok(None)
        } else {
            let readable_slices: ReadableSlices<_> = slices.into_iter().collect();
            Ok(Some(
                bincode::deserialize_from(readable_slices)
                    .map_err(Into::into)
                    .map_err(heed::Error::Decoding)?,
            ))
        }
    }

    /* field distribution */

    /// Writes the field distribution which associates every field name with

@@ -1557,6 +1516,30 @@ impl Index {
            .get(rtxn, main_key::EMBEDDING_CONFIGS)?
            .unwrap_or_default())
    }

    pub(crate) fn put_unavailable_vector_ids(
        &self,
        wtxn: &mut RwTxn<'_>,
        unavailable_vector_ids: RoaringBitmap,
    ) -> heed::Result<()> {
        self.main.remap_types::<Str, CboRoaringBitmapCodec>().put(
            wtxn,
            main_key::VECTOR_UNAVAILABLE_VECTOR_IDS,
            &unavailable_vector_ids,
        )
    }

    pub(crate) fn delete_unavailable_vector_ids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
        self.main.remap_key_type::<Str>().delete(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)
    }

    pub fn unavailable_vector_ids(&self, rtxn: &RoTxn<'_>) -> Result<RoaringBitmap> {
        Ok(self
            .main
            .remap_types::<Str, CboRoaringBitmapCodec>()
            .get(rtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)?
            .unwrap_or_default())
    }
}

#[cfg(test)]
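These three accessors keep a single roaring bitmap of vector ids that are already taken, stored under the `VECTOR_UNAVAILABLE_VECTOR_IDS` key of the main database. A small in-crate sketch of how a caller might reserve a batch of ids with them; the function name and the read-modify-write policy are hypothetical.

```rust
use heed::RwTxn;

use crate::{Index, Result};

/// Hypothetical caller: marks `ids` as used so that a later indexing run
/// does not hand the same vector ids out again.
fn reserve_vector_ids(index: &Index, wtxn: &mut RwTxn, ids: &[u32]) -> Result<()> {
    // Read the current bitmap; the getter falls back to an empty bitmap
    // when the key is missing.
    let mut unavailable = index.unavailable_vector_ids(wtxn)?;
    // Add the freshly used ids, then write the whole bitmap back under the
    // single main-db key.
    unavailable.extend(ids.iter().copied());
    index.put_unavailable_vector_ids(wtxn, unavailable)?;
    Ok(())
}
```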