mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-22 20:50:04 +01:00
WIP arroy integration
This commit is contained in:
parent
13c2c6c16b
commit
dde3a04679
181
Cargo.lock
generated
181
Cargo.lock
generated
@ -380,6 +380,24 @@ dependencies = [
|
||||
"derive_arbitrary",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arroy"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/meilisearch/arroy.git#4b59476f457e5443ff250ea10d40d8b66a692674"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"heed",
|
||||
"log",
|
||||
"memmap2 0.9.0",
|
||||
"ordered-float 4.2.0",
|
||||
"rand",
|
||||
"rayon",
|
||||
"roaring",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "assert-json-diff"
|
||||
version = "2.0.2"
|
||||
@ -537,9 +555,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.3.3"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42"
|
||||
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@ -629,9 +647,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.4.3"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
@ -687,7 +705,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"gemm",
|
||||
"half 2.3.1",
|
||||
"memmap2",
|
||||
"memmap2 0.7.1",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"rand",
|
||||
@ -1561,23 +1579,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.2"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f"
|
||||
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
|
||||
dependencies = [
|
||||
"errno-dragonfly",
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno-dragonfly"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2117,7 +2124,7 @@ version = "0.20.0-alpha.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934"
|
||||
dependencies = [
|
||||
"bitflags 2.3.3",
|
||||
"bitflags 2.4.1",
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"heed-traits",
|
||||
@ -2868,7 +2875,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c619cdaa30bb84088963968bee12a45ea5fbbf355f2c021bcd15589f5ca494a"
|
||||
dependencies = [
|
||||
"num_cpus",
|
||||
"ordered-float",
|
||||
"ordered-float 3.7.0",
|
||||
"parking_lot",
|
||||
"rand",
|
||||
"rayon",
|
||||
@ -2911,7 +2918,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"rustix 0.38.7",
|
||||
"rustix 0.38.26",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@ -3294,9 +3301,9 @@ checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.5"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
|
||||
checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456"
|
||||
|
||||
[[package]]
|
||||
name = "liquid"
|
||||
@ -3412,9 +3419,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.19"
|
||||
version = "0.4.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
|
||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||
|
||||
[[package]]
|
||||
name = "logging_timer"
|
||||
@ -3543,7 +3550,7 @@ dependencies = [
|
||||
"num_cpus",
|
||||
"obkv",
|
||||
"once_cell",
|
||||
"ordered-float",
|
||||
"ordered-float 3.7.0",
|
||||
"parking_lot",
|
||||
"permissive-json-pointer",
|
||||
"pin-project-lite",
|
||||
@ -3618,7 +3625,7 @@ dependencies = [
|
||||
"fst",
|
||||
"insta",
|
||||
"meili-snap",
|
||||
"memmap2",
|
||||
"memmap2 0.7.1",
|
||||
"milli",
|
||||
"roaring",
|
||||
"serde",
|
||||
@ -3662,6 +3669,15 @@ dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memmap2"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "deaba38d7abf1d4cca21cc89e932e542ba2b9258664d2a9ef0e61512039c9375"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.9.0"
|
||||
@ -3675,6 +3691,7 @@ dependencies = [
|
||||
name = "milli"
|
||||
version = "1.5.1"
|
||||
dependencies = [
|
||||
"arroy",
|
||||
"big_s",
|
||||
"bimap",
|
||||
"bincode",
|
||||
@ -3711,12 +3728,12 @@ dependencies = [
|
||||
"maplit",
|
||||
"md5",
|
||||
"meili-snap",
|
||||
"memmap2",
|
||||
"memmap2 0.7.1",
|
||||
"mimalloc",
|
||||
"nolife",
|
||||
"obkv",
|
||||
"once_cell",
|
||||
"ordered-float",
|
||||
"ordered-float 3.7.0",
|
||||
"puffin",
|
||||
"rand",
|
||||
"rand_pcg",
|
||||
@ -3983,7 +4000,7 @@ version = "0.10.59"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a257ad03cd8fb16ad4172fedf8094451e1af1c4b70097636ef2eac9a5f0cc33"
|
||||
dependencies = [
|
||||
"bitflags 2.3.3",
|
||||
"bitflags 2.4.1",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
@ -4036,6 +4053,15 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-float"
|
||||
version = "4.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "page_size"
|
||||
version = "0.5.0"
|
||||
@ -4553,6 +4579,15 @@ dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.4.3"
|
||||
@ -4736,15 +4771,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.7"
|
||||
version = "0.38.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399"
|
||||
checksum = "9470c4bf8246c8daf25f9598dca807fb6510347b1e1cfa55749113850c79d88a"
|
||||
dependencies = [
|
||||
"bitflags 2.3.3",
|
||||
"bitflags 2.4.1",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.4.5",
|
||||
"windows-sys 0.48.0",
|
||||
"linux-raw-sys 0.4.12",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -5279,14 +5314,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.7.1"
|
||||
version = "3.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
|
||||
checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.38.7",
|
||||
"redox_syscall 0.4.1",
|
||||
"rustix 0.38.26",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@ -5997,6 +6032,15 @@ dependencies = [
|
||||
"windows-targets 0.48.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.2"
|
||||
@ -6027,6 +6071,21 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm 0.52.0",
|
||||
"windows_aarch64_msvc 0.52.0",
|
||||
"windows_i686_gnu 0.52.0",
|
||||
"windows_i686_msvc 0.52.0",
|
||||
"windows_x86_64_gnu 0.52.0",
|
||||
"windows_x86_64_gnullvm 0.52.0",
|
||||
"windows_x86_64_msvc 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.42.2"
|
||||
@ -6039,6 +6098,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.42.2"
|
||||
@ -6051,6 +6116,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.42.2"
|
||||
@ -6063,6 +6134,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.42.2"
|
||||
@ -6075,6 +6152,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.42.2"
|
||||
@ -6087,6 +6170,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.42.2"
|
||||
@ -6099,6 +6188,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.42.2"
|
||||
@ -6111,6 +6206,12 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.5.4"
|
||||
|
@ -89,6 +89,8 @@ reqwest = { version = "0.11.16", features = [
|
||||
], default-features = false }
|
||||
tiktoken-rs = "0.5.7"
|
||||
liquid = "0.26.4"
|
||||
arroy = { git = "https://github.com/meilisearch/arroy.git", version = "0.1.0" }
|
||||
rand = "0.8.5"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.37", default-features = false }
|
||||
@ -100,15 +102,7 @@ meili-snap = { path = "../meili-snap" }
|
||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
[features]
|
||||
all-tokenizations = [
|
||||
"charabia/chinese",
|
||||
"charabia/hebrew",
|
||||
"charabia/japanese",
|
||||
"charabia/thai",
|
||||
"charabia/korean",
|
||||
"charabia/greek",
|
||||
"charabia/khmer",
|
||||
]
|
||||
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
|
||||
|
||||
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
||||
# For more information on this feature, see heed's Cargo.toml
|
||||
|
@ -22,7 +22,6 @@ use crate::heed_codec::{
|
||||
BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
|
||||
};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::readable_slices::ReadableSlices;
|
||||
use crate::vector::EmbeddingConfig;
|
||||
use crate::{
|
||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||
@ -49,10 +48,6 @@ pub mod main_key {
|
||||
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
|
||||
pub const GEO_RTREE_KEY: &str = "geo-rtree";
|
||||
/// The prefix of the key that is used to store the, potential big, HNSW structure.
|
||||
/// It is concatenated with a big-endian encoded number (non-human readable).
|
||||
/// e.g. vector-hnsw0x0032.
|
||||
pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
|
||||
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
||||
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
|
||||
pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
|
||||
@ -75,6 +70,7 @@ pub mod main_key {
|
||||
pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
|
||||
pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
|
||||
pub const PROXIMITY_PRECISION: &str = "proximity-precision";
|
||||
pub const VECTOR_UNAVAILABLE_VECTOR_IDS: &str = "vector-unavailable-vector-ids";
|
||||
pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
|
||||
}
|
||||
|
||||
@ -102,6 +98,9 @@ pub mod db_name {
|
||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
|
||||
pub const VECTOR_DOCID_IDS: &str = "vector-docid-ids";
|
||||
pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
|
||||
pub const VECTOR_ARROY: &str = "vector-arroy";
|
||||
pub const DOCUMENTS: &str = "documents";
|
||||
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
|
||||
}
|
||||
@ -168,8 +167,16 @@ pub struct Index {
|
||||
/// Maps the document id, the facet field id and the strings.
|
||||
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
||||
|
||||
/// Maps a vector id to the document id that have it.
|
||||
/// Maps a vector id to its document id.
|
||||
pub vector_id_docid: Database<BEU32, BEU32>,
|
||||
/// Maps a doc id to its vector ids.
|
||||
pub docid_vector_ids: Database<BEU32, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps an embedder name to its id in the arroy store.
|
||||
pub embedder_category_id: Database<Str, BEU16>,
|
||||
|
||||
/// Vector store based on arroy™.
|
||||
pub vector_arroy: arroy::Database<arroy::distances::DotProduct>,
|
||||
|
||||
/// Maps the document id to the document as an obkv store.
|
||||
pub(crate) documents: Database<BEU32, ObkvCodec>,
|
||||
@ -184,7 +191,7 @@ impl Index {
|
||||
) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(24);
|
||||
options.max_dbs(27);
|
||||
|
||||
let env = options.open(path)?;
|
||||
let mut wtxn = env.write_txn()?;
|
||||
@ -224,7 +231,13 @@ impl Index {
|
||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings =
|
||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||
// vector stuff
|
||||
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
|
||||
let docid_vector_ids = env.create_database(&mut wtxn, Some(VECTOR_DOCID_IDS))?;
|
||||
let embedder_category_id =
|
||||
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
|
||||
let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
|
||||
|
||||
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
||||
wtxn.commit()?;
|
||||
|
||||
@ -255,6 +268,9 @@ impl Index {
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_id_docid,
|
||||
vector_arroy,
|
||||
docid_vector_ids,
|
||||
embedder_category_id,
|
||||
documents,
|
||||
})
|
||||
}
|
||||
@ -477,63 +493,6 @@ impl Index {
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/* vector HNSW */
|
||||
|
||||
/// Writes the provided `hnsw`.
|
||||
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
|
||||
// We must delete all the chunks before we write the new HNSW chunks.
|
||||
self.delete_vector_hnsw(wtxn)?;
|
||||
|
||||
let chunk_size = 1024 * 1024 * (1024 + 512); // 1.5 GiB
|
||||
let bytes = bincode::serialize(hnsw).map_err(Into::into).map_err(heed::Error::Encoding)?;
|
||||
for (i, chunk) in bytes.chunks(chunk_size).enumerate() {
|
||||
let i = i as u32;
|
||||
let mut key = main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes().to_vec();
|
||||
key.extend_from_slice(&i.to_be_bytes());
|
||||
self.main.remap_types::<Bytes, Bytes>().put(wtxn, &key, chunk)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete the `hnsw`.
|
||||
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
||||
let mut iter = self
|
||||
.main
|
||||
.remap_types::<Bytes, DecodeIgnore>()
|
||||
.prefix_iter_mut(wtxn, main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes())?;
|
||||
let mut deleted = false;
|
||||
while iter.next().transpose()?.is_some() {
|
||||
// We do not keep a reference to the key or the value.
|
||||
unsafe { deleted |= iter.del_current()? };
|
||||
}
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
/// Returns the `hnsw`.
|
||||
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
|
||||
let mut slices = Vec::new();
|
||||
for result in self
|
||||
.main
|
||||
.remap_types::<Str, Bytes>()
|
||||
.prefix_iter(rtxn, main_key::VECTOR_HNSW_KEY_PREFIX)?
|
||||
{
|
||||
let (_, slice) = result?;
|
||||
slices.push(slice);
|
||||
}
|
||||
|
||||
if slices.is_empty() {
|
||||
Ok(None)
|
||||
} else {
|
||||
let readable_slices: ReadableSlices<_> = slices.into_iter().collect();
|
||||
Ok(Some(
|
||||
bincode::deserialize_from(readable_slices)
|
||||
.map_err(Into::into)
|
||||
.map_err(heed::Error::Decoding)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/* field distribution */
|
||||
|
||||
/// Writes the field distribution which associates every field name with
|
||||
@ -1557,6 +1516,30 @@ impl Index {
|
||||
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
pub(crate) fn put_unavailable_vector_ids(
|
||||
&self,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
unavailable_vector_ids: RoaringBitmap,
|
||||
) -> heed::Result<()> {
|
||||
self.main.remap_types::<Str, CboRoaringBitmapCodec>().put(
|
||||
wtxn,
|
||||
main_key::VECTOR_UNAVAILABLE_VECTOR_IDS,
|
||||
&unavailable_vector_ids,
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn delete_unavailable_vector_ids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
|
||||
self.main.remap_key_type::<Str>().delete(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)
|
||||
}
|
||||
|
||||
pub fn unavailable_vector_ids(&self, rtxn: &RoTxn<'_>) -> Result<RoaringBitmap> {
|
||||
Ok(self
|
||||
.main
|
||||
.remap_types::<Str, CboRoaringBitmapCodec>()
|
||||
.get(rtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -19,7 +19,6 @@ pub mod heed_codec;
|
||||
pub mod index;
|
||||
pub mod prompt;
|
||||
pub mod proximity;
|
||||
mod readable_slices;
|
||||
pub mod score_details;
|
||||
mod search;
|
||||
pub mod update;
|
||||
|
@ -1,85 +0,0 @@
|
||||
use std::io::{self, Read};
|
||||
use std::iter::FromIterator;
|
||||
|
||||
pub struct ReadableSlices<A> {
|
||||
inner: Vec<A>,
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl<A> FromIterator<A> for ReadableSlices<A> {
|
||||
fn from_iter<T: IntoIterator<Item = A>>(iter: T) -> Self {
|
||||
ReadableSlices { inner: iter.into_iter().collect(), pos: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: AsRef<[u8]>> Read for ReadableSlices<A> {
|
||||
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
||||
let original_buf_len = buf.len();
|
||||
|
||||
// We explore the list of slices to find the one where we must start reading.
|
||||
let mut pos = self.pos;
|
||||
let index = match self
|
||||
.inner
|
||||
.iter()
|
||||
.map(|s| s.as_ref().len() as u64)
|
||||
.position(|size| pos.checked_sub(size).map(|p| pos = p).is_none())
|
||||
{
|
||||
Some(index) => index,
|
||||
None => return Ok(0),
|
||||
};
|
||||
|
||||
let mut inner_pos = pos as usize;
|
||||
for slice in &self.inner[index..] {
|
||||
let slice = &slice.as_ref()[inner_pos..];
|
||||
|
||||
if buf.len() > slice.len() {
|
||||
// We must exhaust the current slice and go to the next one there is not enough here.
|
||||
buf[..slice.len()].copy_from_slice(slice);
|
||||
buf = &mut buf[slice.len()..];
|
||||
inner_pos = 0;
|
||||
} else {
|
||||
// There is enough in this slice to fill the remaining bytes of the buffer.
|
||||
// Let's break just after filling it.
|
||||
buf.copy_from_slice(&slice[..buf.len()]);
|
||||
buf = &mut [];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let written = original_buf_len - buf.len();
|
||||
self.pos += written as u64;
|
||||
Ok(written)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::io::Read;
|
||||
|
||||
use super::ReadableSlices;
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let data: Vec<_> = (0..100).collect();
|
||||
let splits: Vec<_> = data.chunks(3).collect();
|
||||
let mut rdslices: ReadableSlices<_> = splits.into_iter().collect();
|
||||
|
||||
let mut output = Vec::new();
|
||||
let length = rdslices.read_to_end(&mut output).unwrap();
|
||||
assert_eq!(length, data.len());
|
||||
assert_eq!(output, data);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn small_reads() {
|
||||
let data: Vec<_> = (0..u8::MAX).collect();
|
||||
let splits: Vec<_> = data.chunks(27).collect();
|
||||
let mut rdslices: ReadableSlices<_> = splits.into_iter().collect();
|
||||
|
||||
let buffer = &mut [0; 45];
|
||||
let length = rdslices.read(buffer).unwrap();
|
||||
let expected: Vec<_> = (0..buffer.len() as u8).collect();
|
||||
assert_eq!(length, buffer.len());
|
||||
assert_eq!(buffer, &expected[..]);
|
||||
}
|
||||
}
|
@ -11,64 +11,31 @@ use crate::index::Hnsw;
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::{Result, SearchContext, SearchLogger, UserError};
|
||||
|
||||
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
||||
pub struct VectorSort<'ctx, Q: RankingRuleQueryTrait> {
|
||||
query: Option<Q>,
|
||||
target: Vec<f32>,
|
||||
vector_candidates: RoaringBitmap,
|
||||
scope: nolife::DynBoxScope<SearchFamily>,
|
||||
reader: arroy::Reader<'ctx, arroy::distances::DotProduct>,
|
||||
limit: usize,
|
||||
}
|
||||
|
||||
type Item<'a> = instant_distance::Item<'a, NDotProductPoint>;
|
||||
type SearchFut = Pin<Box<dyn Future<Output = nolife::Never>>>;
|
||||
|
||||
struct SearchFamily;
|
||||
impl<'a> nolife::Family<'a> for SearchFamily {
|
||||
type Family = Box<dyn Iterator<Item = Item<'a>> + 'a>;
|
||||
}
|
||||
|
||||
async fn search_scope(
|
||||
mut time_capsule: nolife::TimeCapsule<SearchFamily>,
|
||||
hnsw: Hnsw,
|
||||
target: Vec<f32>,
|
||||
) -> nolife::Never {
|
||||
let mut search = instant_distance::Search::default();
|
||||
let it = Box::new(hnsw.search(&NDotProductPoint::new(target), &mut search));
|
||||
let mut it: Box<dyn Iterator<Item = Item>> = it;
|
||||
loop {
|
||||
time_capsule.freeze(&mut it).await;
|
||||
}
|
||||
}
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> VectorSort<'ctx, Q> {
|
||||
pub fn new(
|
||||
ctx: &SearchContext,
|
||||
ctx: &'ctx SearchContext,
|
||||
target: Vec<f32>,
|
||||
vector_candidates: RoaringBitmap,
|
||||
limit: usize,
|
||||
) -> Result<Self> {
|
||||
let hnsw =
|
||||
ctx.index.vector_hnsw(ctx.txn)?.unwrap_or(Hnsw::builder().build_hnsw(Vec::default()).0);
|
||||
|
||||
if let Some(expected_size) = hnsw.iter().map(|(_, point)| point.len()).next() {
|
||||
if target.len() != expected_size {
|
||||
return Err(UserError::InvalidVectorDimensions {
|
||||
expected: expected_size,
|
||||
found: target.len(),
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
/// FIXME? what to do in case of missing metadata
|
||||
let reader = arroy::Reader::open(ctx.txn, 0, ctx.index.vector_arroy)?;
|
||||
|
||||
let target_clone = target.clone();
|
||||
let producer = move |time_capsule| -> SearchFut {
|
||||
Box::pin(search_scope(time_capsule, hnsw, target_clone))
|
||||
};
|
||||
let scope = DynBoxScope::new(producer);
|
||||
|
||||
Ok(Self { query: None, target, vector_candidates, scope })
|
||||
Ok(Self { query: None, target, vector_candidates, reader, limit })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q> {
|
||||
fn id(&self) -> String {
|
||||
"vector_sort".to_owned()
|
||||
}
|
||||
@ -108,11 +75,11 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
|
||||
}),
|
||||
}));
|
||||
}
|
||||
|
||||
let scope = &mut self.scope;
|
||||
let target = &self.target;
|
||||
let vector_candidates = &self.vector_candidates;
|
||||
|
||||
let result = self.reader.nns_by_vector(ctx.txn, &target, count, search_k, candidates)
|
||||
|
||||
scope.enter(|it| {
|
||||
for item in it.by_ref() {
|
||||
let item: Item = item;
|
||||
|
@ -43,6 +43,9 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_id_docid,
|
||||
vector_arroy,
|
||||
docid_vector_ids,
|
||||
embedder_category_id: _,
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
@ -58,7 +61,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||
|
||||
// Clear the other databases.
|
||||
external_documents_ids.clear(self.wtxn)?;
|
||||
@ -82,7 +84,11 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
// vector
|
||||
vector_arroy.clear(self.wtxn)?;
|
||||
vector_id_docid.clear(self.wtxn)?;
|
||||
docid_vector_ids.clear(self.wtxn)?;
|
||||
|
||||
documents.clear(self.wtxn)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
|
@ -312,7 +312,8 @@ fn send_original_documents_data(
|
||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
/// FIXME: compute an expected dimension from the manual vectors if any
|
||||
expected_dimension: expected_dimension.unwrap(),
|
||||
manual_vectors,
|
||||
}))
|
||||
}
|
||||
|
@ -15,6 +15,7 @@ use crossbeam_channel::{Receiver, Sender};
|
||||
use heed::types::Str;
|
||||
use heed::Database;
|
||||
use log::debug;
|
||||
use rand::SeedableRng;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use slice_group_by::GroupBy;
|
||||
@ -489,6 +490,9 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
let writer = arroy::Writer::prepare(self.wtxn, self.index.vector_arroy, 0, 0)?;
|
||||
writer.build(self.wtxn, &mut rand::rngs::StdRng::from_entropy(), None)?;
|
||||
|
||||
// We write the field distribution into the main database
|
||||
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
@ -27,6 +27,7 @@ use crate::index::Hnsw;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
||||
use crate::update::{available_documents_ids, AvailableDocumentsIds};
|
||||
use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError};
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
@ -50,7 +51,7 @@ pub(crate) enum TypedChunk {
|
||||
VectorPoints {
|
||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
||||
expected_dimension: Option<usize>,
|
||||
expected_dimension: usize,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
},
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||
@ -106,7 +107,7 @@ impl TypedChunk {
|
||||
format!("GeoPoints {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension } => {
|
||||
format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension.unwrap_or_default())
|
||||
format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension)
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||
@ -373,46 +374,53 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
return Ok((RoaringBitmap::new(), is_merged_database));
|
||||
}
|
||||
|
||||
let mut docid_vectors_map: HashMap<DocumentId, HashSet<Vec<OrderedFloat<f32>>>> =
|
||||
HashMap::new();
|
||||
|
||||
// We extract and store the previous vectors
|
||||
if let Some(hnsw) = index.vector_hnsw(wtxn)? {
|
||||
for (pid, point) in hnsw.iter() {
|
||||
let pid_key = pid.into_inner();
|
||||
let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap();
|
||||
let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect();
|
||||
docid_vectors_map.entry(docid).or_default().insert(vector);
|
||||
}
|
||||
}
|
||||
let mut unavailable_vector_ids = index.unavailable_vector_ids(&wtxn)?;
|
||||
/// FIXME: allow customizing distance
|
||||
/// FIXME: allow customizing index
|
||||
let writer = arroy::Writer::prepare(wtxn, index.vector_arroy, 0, expected_dimension)?;
|
||||
|
||||
// remove vectors for docids we want them removed
|
||||
let mut cursor = remove_vectors.into_cursor()?;
|
||||
while let Some((key, _)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
docid_vectors_map.remove(&docid);
|
||||
let Some(to_remove_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)? else {
|
||||
continue;
|
||||
};
|
||||
unavailable_vector_ids -= to_remove_vector_ids;
|
||||
|
||||
for item in to_remove_vector_ids {
|
||||
writer.del_item(wtxn, item)?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut available_vector_ids =
|
||||
AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
|
||||
// add generated embeddings
|
||||
if let Some((embeddings, expected_dimension)) = embeddings.zip(expected_dimension) {
|
||||
if let Some(embeddings) = embeddings {
|
||||
let mut cursor = embeddings.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data: Vec<OrderedFloat<_>> =
|
||||
pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
let data = pod_collect_to_vec(value);
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings =
|
||||
crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
|
||||
let mut set = HashSet::new();
|
||||
let mut new_vector_ids = RoaringBitmap::new();
|
||||
for embedding in embeddings.iter() {
|
||||
set.insert(embedding.to_vec());
|
||||
}
|
||||
/// FIXME: error when you get over 9000
|
||||
let next_vector_id = available_vector_ids.next().unwrap();
|
||||
unavailable_vector_ids.insert(next_vector_id);
|
||||
|
||||
docid_vectors_map.insert(docid, set);
|
||||
new_vector_ids.insert(next_vector_id);
|
||||
|
||||
index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
|
||||
|
||||
writer.add_item(wtxn, next_vector_id, embedding)?;
|
||||
}
|
||||
index.docid_vector_ids.put(wtxn, &docid, &new_vector_ids)?;
|
||||
}
|
||||
}
|
||||
|
||||
@ -425,68 +433,44 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
|
||||
let vector_deladd_obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector: Vec<OrderedFloat<_>> =
|
||||
pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
docid_vectors_map.entry(docid).and_modify(|v| {
|
||||
if !v.remove(&vector) {
|
||||
error!("Unable to delete the vector: {:?}", vector);
|
||||
let vector = pod_collect_to_vec(value);
|
||||
let Some(mut docid_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)?
|
||||
else {
|
||||
error!("Unable to delete the vector: {:?}", vector);
|
||||
continue;
|
||||
};
|
||||
for item in docid_vector_ids {
|
||||
/// FIXME: comparing the vectors by equality is inefficient, and dangerous by perfect equality
|
||||
let candidate = writer.item_vector(&wtxn, item)?.expect("Inconsistent dbs");
|
||||
if candidate == vector {
|
||||
writer.del_item(wtxn, item)?;
|
||||
unavailable_vector_ids.remove(item);
|
||||
index.vector_id_docid.delete(wtxn, &item)?;
|
||||
docid_vector_ids.remove(item);
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
// convert the vector back to a Vec<f32>
|
||||
let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
|
||||
docid_vectors_map.entry(docid).and_modify(|v| {
|
||||
v.insert(vector);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the most common vector dimension
|
||||
let expected_dimension_size = {
|
||||
let mut dims = HashMap::new();
|
||||
docid_vectors_map
|
||||
.values()
|
||||
.flat_map(|v| v.iter())
|
||||
.for_each(|v| *dims.entry(v.len()).or_insert(0) += 1);
|
||||
dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
|
||||
};
|
||||
|
||||
// Ensure that the vector lengths are correct and
|
||||
// prepare the vectors before inserting them in the HNSW.
|
||||
let mut points = Vec::new();
|
||||
let mut docids = Vec::new();
|
||||
for (docid, vector) in docid_vectors_map
|
||||
.into_iter()
|
||||
.flat_map(|(docid, vectors)| std::iter::repeat(docid).zip(vectors))
|
||||
{
|
||||
if expected_dimension_size.map_or(false, |expected| expected != vector.len()) {
|
||||
return Err(UserError::InvalidVectorDimensions {
|
||||
expected: expected_dimension_size.unwrap_or(vector.len()),
|
||||
found: vector.len(),
|
||||
}
|
||||
.into());
|
||||
} else {
|
||||
let vector = vector.into_iter().map(OrderedFloat::into_inner).collect();
|
||||
points.push(NDotProductPoint::new(vector));
|
||||
docids.push(docid);
|
||||
index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
|
||||
}
|
||||
let mut available_vector_ids =
|
||||
AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
|
||||
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
let vector = pod_collect_to_vec(value);
|
||||
let next_vector_id = available_vector_ids.next().unwrap();
|
||||
|
||||
writer.add_item(wtxn, next_vector_id, &vector)?;
|
||||
unavailable_vector_ids.insert(next_vector_id);
|
||||
index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
|
||||
let mut docid_vector_ids =
|
||||
index.docid_vector_ids.get(&wtxn, &docid)?.unwrap_or_default();
|
||||
docid_vector_ids.insert(next_vector_id);
|
||||
index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
|
||||
}
|
||||
}
|
||||
|
||||
let hnsw_length = points.len();
|
||||
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
|
||||
|
||||
assert_eq!(docids.len(), pids.len());
|
||||
|
||||
// Store the vectors in the point-docid relation database
|
||||
index.vector_id_docid.clear(wtxn)?;
|
||||
for (docid, pid) in docids.into_iter().zip(pids) {
|
||||
index.vector_id_docid.put(wtxn, &pid.into_inner(), &docid)?;
|
||||
}
|
||||
|
||||
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
||||
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
||||
log::debug!("There are {} entries in the arroy so far", unavailable_vector_ids.len());
|
||||
index.put_unavailable_vector_ids(wtxn, unavailable_vector_ids)?;
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
for (key, (deletion, addition)) in sl_map {
|
||||
|
Loading…
x
Reference in New Issue
Block a user