mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-05 11:21:51 +01:00
Store the vectors in an HNSW in LMDB
This commit is contained in:
parent
7ac2f1489d
commit
4571e512d2
63
Cargo.lock
generated
63
Cargo.lock
generated
@ -1221,6 +1221,12 @@ dependencies = [
|
|||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "doc-comment"
|
||||||
|
version = "0.3.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dump"
|
name = "dump"
|
||||||
version = "1.2.0"
|
version = "1.2.0"
|
||||||
@ -1725,6 +1731,15 @@ dependencies = [
|
|||||||
"byteorder",
|
"byteorder",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.11.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
|
||||||
|
dependencies = [
|
||||||
|
"ahash 0.7.6",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hashbrown"
|
name = "hashbrown"
|
||||||
version = "0.12.3"
|
version = "0.12.3"
|
||||||
@ -1826,6 +1841,22 @@ dependencies = [
|
|||||||
"digest",
|
"digest",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hnsw"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b9740ebf8769ec4ad6762cc951ba18f39bba6dfbc2fbbe46285f7539af79752"
|
||||||
|
dependencies = [
|
||||||
|
"ahash 0.7.6",
|
||||||
|
"hashbrown 0.11.2",
|
||||||
|
"libm",
|
||||||
|
"num-traits",
|
||||||
|
"rand_core",
|
||||||
|
"serde",
|
||||||
|
"smallvec",
|
||||||
|
"space",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "http"
|
name = "http"
|
||||||
version = "0.2.9"
|
version = "0.2.9"
|
||||||
@ -1956,7 +1987,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
"hashbrown",
|
"hashbrown 0.12.3",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -2057,7 +2088,7 @@ checksum = "37228e06c75842d1097432d94d02f37fe3ebfca9791c2e8fef6e9db17ed128c1"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"cedarwood",
|
"cedarwood",
|
||||||
"fxhash",
|
"fxhash",
|
||||||
"hashbrown",
|
"hashbrown 0.12.3",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"phf",
|
"phf",
|
||||||
"phf_codegen",
|
"phf_codegen",
|
||||||
@ -2698,6 +2729,7 @@ dependencies = [
|
|||||||
"geoutils",
|
"geoutils",
|
||||||
"grenad",
|
"grenad",
|
||||||
"heed",
|
"heed",
|
||||||
|
"hnsw",
|
||||||
"insta",
|
"insta",
|
||||||
"itertools",
|
"itertools",
|
||||||
"json-depth-checker",
|
"json-depth-checker",
|
||||||
@ -2712,6 +2744,7 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"ordered-float",
|
"ordered-float",
|
||||||
"rand",
|
"rand",
|
||||||
|
"rand_pcg",
|
||||||
"rayon",
|
"rayon",
|
||||||
"roaring",
|
"roaring",
|
||||||
"rstar",
|
"rstar",
|
||||||
@ -2721,6 +2754,7 @@ dependencies = [
|
|||||||
"smallstr",
|
"smallstr",
|
||||||
"smallvec",
|
"smallvec",
|
||||||
"smartstring",
|
"smartstring",
|
||||||
|
"space",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"time",
|
"time",
|
||||||
@ -3273,6 +3307,16 @@ dependencies = [
|
|||||||
"getrandom",
|
"getrandom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_pcg"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rayon"
|
name = "rayon"
|
||||||
version = "1.7.0"
|
version = "1.7.0"
|
||||||
@ -3732,6 +3776,9 @@ name = "smallvec"
|
|||||||
version = "1.10.0"
|
version = "1.10.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "smartstring"
|
name = "smartstring"
|
||||||
@ -3754,6 +3801,16 @@ dependencies = [
|
|||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "space"
|
||||||
|
version = "0.17.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c5ab9701ae895386d13db622abf411989deff7109b13b46b6173bb4ce5c1d123"
|
||||||
|
dependencies = [
|
||||||
|
"doc-comment",
|
||||||
|
"num-traits",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "spin"
|
name = "spin"
|
||||||
version = "0.5.2"
|
version = "0.5.2"
|
||||||
@ -4405,7 +4462,7 @@ version = "0.16.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
|
checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"hashbrown",
|
"hashbrown 0.12.3",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@ license.workspace = true
|
|||||||
bimap = { version = "0.6.3", features = ["serde"] }
|
bimap = { version = "0.6.3", features = ["serde"] }
|
||||||
bincode = "1.3.3"
|
bincode = "1.3.3"
|
||||||
bstr = "1.4.0"
|
bstr = "1.4.0"
|
||||||
bytemuck = "1.13.1"
|
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
|
||||||
byteorder = "1.4.3"
|
byteorder = "1.4.3"
|
||||||
charabia = { version = "0.7.2", default-features = false }
|
charabia = { version = "0.7.2", default-features = false }
|
||||||
concat-arrays = "0.1.2"
|
concat-arrays = "0.1.2"
|
||||||
@ -33,18 +33,21 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f
|
|||||||
"lmdb",
|
"lmdb",
|
||||||
"sync-read-txn",
|
"sync-read-txn",
|
||||||
] }
|
] }
|
||||||
|
hnsw = { version = "0.11.0", features = ["serde1"] }
|
||||||
json-depth-checker = { path = "../json-depth-checker" }
|
json-depth-checker = { path = "../json-depth-checker" }
|
||||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||||
memmap2 = "0.5.10"
|
memmap2 = "0.5.10"
|
||||||
obkv = "0.2.0"
|
obkv = "0.2.0"
|
||||||
once_cell = "1.17.1"
|
once_cell = "1.17.1"
|
||||||
ordered-float = "3.6.0"
|
ordered-float = "3.6.0"
|
||||||
|
rand_pcg = { version = "0.3.1", features = ["serde1"] }
|
||||||
rayon = "1.7.0"
|
rayon = "1.7.0"
|
||||||
roaring = "0.10.1"
|
roaring = "0.10.1"
|
||||||
rstar = { version = "0.10.0", features = ["serde"] }
|
rstar = { version = "0.10.0", features = ["serde"] }
|
||||||
serde = { version = "1.0.160", features = ["derive"] }
|
serde = { version = "1.0.160", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||||
slice-group-by = "0.3.0"
|
slice-group-by = "0.3.0"
|
||||||
|
space = "0.17.0"
|
||||||
smallstr = { version = "0.3.0", features = ["serde"] }
|
smallstr = { version = "0.3.0", features = ["serde"] }
|
||||||
smallvec = "1.10.0"
|
smallvec = "1.10.0"
|
||||||
smartstring = "1.0.1"
|
smartstring = "1.0.1"
|
||||||
|
16
milli/src/dot_product.rs
Normal file
16
milli/src/dot_product.rs
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use space::Metric;
|
||||||
|
|
||||||
|
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
|
||||||
|
pub struct DotProduct;
|
||||||
|
|
||||||
|
impl Metric<Vec<f32>> for DotProduct {
|
||||||
|
type Unit = u32;
|
||||||
|
|
||||||
|
// Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
|
||||||
|
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
|
||||||
|
let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
|
||||||
|
debug_assert!(!dist.is_nan());
|
||||||
|
dist.to_bits()
|
||||||
|
}
|
||||||
|
}
|
@ -8,10 +8,12 @@ use charabia::{Language, Script};
|
|||||||
use heed::flags::Flags;
|
use heed::flags::Flags;
|
||||||
use heed::types::*;
|
use heed::types::*;
|
||||||
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
|
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
|
||||||
|
use rand_pcg::Pcg32;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use rstar::RTree;
|
use rstar::RTree;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
|
use crate::dot_product::DotProduct;
|
||||||
use crate::error::{InternalError, UserError};
|
use crate::error::{InternalError, UserError};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::fields_ids_map::FieldsIdsMap;
|
use crate::fields_ids_map::FieldsIdsMap;
|
||||||
@ -26,6 +28,9 @@ use crate::{
|
|||||||
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
|
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// The HNSW data-structure that we serialize, fill and search in.
|
||||||
|
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
|
||||||
|
|
||||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||||
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
|
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
|
||||||
|
|
||||||
@ -42,6 +47,7 @@ pub mod main_key {
|
|||||||
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||||
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
|
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
|
||||||
pub const GEO_RTREE_KEY: &str = "geo-rtree";
|
pub const GEO_RTREE_KEY: &str = "geo-rtree";
|
||||||
|
pub const VECTOR_HNSW_KEY: &str = "vector-hnsw";
|
||||||
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
|
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
|
||||||
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
|
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
|
||||||
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
||||||
@ -86,6 +92,7 @@ pub mod db_name {
|
|||||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||||
|
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
|
||||||
pub const DOCUMENTS: &str = "documents";
|
pub const DOCUMENTS: &str = "documents";
|
||||||
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
|
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
|
||||||
}
|
}
|
||||||
@ -149,6 +156,9 @@ pub struct Index {
|
|||||||
/// Maps the document id, the facet field id and the strings.
|
/// Maps the document id, the facet field id and the strings.
|
||||||
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
||||||
|
|
||||||
|
/// Maps a vector id to the document id that have it.
|
||||||
|
pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
|
||||||
|
|
||||||
/// Maps the document id to the document as an obkv store.
|
/// Maps the document id to the document as an obkv store.
|
||||||
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
||||||
}
|
}
|
||||||
@ -162,7 +172,7 @@ impl Index {
|
|||||||
) -> Result<Index> {
|
) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(23);
|
options.max_dbs(24);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
@ -198,11 +208,11 @@ impl Index {
|
|||||||
env.create_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
|
||||||
let facet_id_is_empty_docids =
|
let facet_id_is_empty_docids =
|
||||||
env.create_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
|
||||||
|
|
||||||
let field_id_docid_facet_f64s =
|
let field_id_docid_facet_f64s =
|
||||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
|
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||||
let field_id_docid_facet_strings =
|
let field_id_docid_facet_strings =
|
||||||
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||||
|
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
|
||||||
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
||||||
wtxn.commit()?;
|
wtxn.commit()?;
|
||||||
|
|
||||||
@ -231,6 +241,7 @@ impl Index {
|
|||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
|
vector_id_docid,
|
||||||
documents,
|
documents,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -502,6 +513,26 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* vector HNSW */
|
||||||
|
|
||||||
|
/// Writes the provided `hnsw`.
|
||||||
|
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
|
||||||
|
self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete the `hnsw`.
|
||||||
|
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
|
||||||
|
self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the `hnsw`.
|
||||||
|
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
|
||||||
|
match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? {
|
||||||
|
Some(hnsw) => Ok(Some(hnsw)),
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* field distribution */
|
/* field distribution */
|
||||||
|
|
||||||
/// Writes the field distribution which associates every field name with
|
/// Writes the field distribution which associates every field name with
|
||||||
@ -1466,9 +1497,9 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
db_snap!(index, field_distribution,
|
db_snap!(index, field_distribution,
|
||||||
@r###"
|
@r###"
|
||||||
age 1 |
|
age 1
|
||||||
id 2 |
|
id 2
|
||||||
name 2 |
|
name 2
|
||||||
"###
|
"###
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -1486,9 +1517,9 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
db_snap!(index, field_distribution,
|
db_snap!(index, field_distribution,
|
||||||
@r###"
|
@r###"
|
||||||
age 1 |
|
age 1
|
||||||
id 2 |
|
id 2
|
||||||
name 2 |
|
name 2
|
||||||
"###
|
"###
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -1502,9 +1533,9 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
db_snap!(index, field_distribution,
|
db_snap!(index, field_distribution,
|
||||||
@r###"
|
@r###"
|
||||||
has_dog 1 |
|
has_dog 1
|
||||||
id 2 |
|
id 2
|
||||||
name 2 |
|
name 2
|
||||||
"###
|
"###
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@ pub mod documents;
|
|||||||
|
|
||||||
mod asc_desc;
|
mod asc_desc;
|
||||||
mod criterion;
|
mod criterion;
|
||||||
|
pub mod dot_product;
|
||||||
mod error;
|
mod error;
|
||||||
mod external_documents_ids;
|
mod external_documents_ids;
|
||||||
pub mod facet;
|
pub mod facet;
|
||||||
|
@ -39,6 +39,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
|
vector_id_docid,
|
||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
|
|
||||||
@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||||
self.index.delete_geo_rtree(self.wtxn)?;
|
self.index.delete_geo_rtree(self.wtxn)?;
|
||||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||||
|
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||||
|
|
||||||
// We clean all the faceted documents ids.
|
// We clean all the faceted documents ids.
|
||||||
for field_id in faceted_fields {
|
for field_id in faceted_fields {
|
||||||
@ -95,6 +97,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
facet_id_string_docids.clear(self.wtxn)?;
|
facet_id_string_docids.clear(self.wtxn)?;
|
||||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||||
|
vector_id_docid.clear(self.wtxn)?;
|
||||||
documents.clear(self.wtxn)?;
|
documents.clear(self.wtxn)?;
|
||||||
|
|
||||||
Ok(number_of_documents)
|
Ok(number_of_documents)
|
||||||
|
@ -240,6 +240,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
facet_id_exists_docids,
|
facet_id_exists_docids,
|
||||||
facet_id_is_null_docids,
|
facet_id_is_null_docids,
|
||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
|
vector_id_docid,
|
||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
// Remove from the documents database
|
// Remove from the documents database
|
||||||
|
@ -4,10 +4,12 @@ use std::convert::TryInto;
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
use bytemuck::allocation::pod_collect_to_vec;
|
||||||
use charabia::{Language, Script};
|
use charabia::{Language, Script};
|
||||||
use grenad::MergerBuilder;
|
use grenad::MergerBuilder;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::RwTxn;
|
use heed::RwTxn;
|
||||||
|
use hnsw::Searcher;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
@ -17,7 +19,7 @@ use super::{ClonableMmap, MergeFn};
|
|||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::update::facet::FacetsUpdate;
|
use crate::update::facet::FacetsUpdate;
|
||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
|
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
|
||||||
|
|
||||||
pub(crate) enum TypedChunk {
|
pub(crate) enum TypedChunk {
|
||||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||||
@ -223,27 +225,19 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||||
}
|
}
|
||||||
TypedChunk::VectorPoints(vector_points) => {
|
TypedChunk::VectorPoints(vector_points) => {
|
||||||
// let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
|
||||||
// let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
let mut searcher = Searcher::new();
|
||||||
|
|
||||||
// let mut cursor = geo_points.into_cursor()?;
|
let mut cursor = vector_points.into_cursor()?;
|
||||||
// while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
// // convert the key back to a u32 (4 bytes)
|
// convert the key back to a u32 (4 bytes)
|
||||||
// let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
// convert the vector back to a Vec<f32>
|
||||||
// // convert the latitude and longitude back to a f64 (8 bytes)
|
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||||
// let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
|
||||||
// let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
|
||||||
// let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
}
|
||||||
// let xyz_point = lat_lng_to_xyz(&point);
|
index.put_vector_hnsw(wtxn, &hnsw)?;
|
||||||
|
|
||||||
// rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
|
|
||||||
// geo_faceted_docids.insert(docid);
|
|
||||||
// }
|
|
||||||
// index.put_geo_rtree(wtxn, &rtree)?;
|
|
||||||
// index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
|
||||||
|
|
||||||
todo!("index vector points")
|
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user