From 268a9ef416206b61b49dbecf881987223fea9f74 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 13 Jun 2023 15:19:01 +0200 Subject: [PATCH] Move to the hgg crate --- Cargo.lock | 54 +++++++------------ milli/Cargo.toml | 3 +- milli/src/{dot_product.rs => distance.rs} | 14 +++++ milli/src/index.rs | 39 ++++++-------- milli/src/lib.rs | 2 +- milli/src/search/new/mod.rs | 36 ++++--------- milli/src/update/clear_documents.rs | 4 +- milli/src/update/delete_documents.rs | 3 +- .../src/update/index_documents/typed_chunk.rs | 17 +++--- 9 files changed, 73 insertions(+), 99 deletions(-) rename milli/src/{dot_product.rs => distance.rs} (63%) diff --git a/Cargo.lock b/Cargo.lock index 904d1c225..f2fe02366 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1736,9 +1736,6 @@ name = "hashbrown" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -dependencies = [ - "ahash 0.7.6", -] [[package]] name = "hashbrown" @@ -1749,6 +1746,12 @@ dependencies = [ "ahash 0.7.6", ] +[[package]] +name = "header-vec" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda7e66d32131841c4264e34a32c934df0dedb08d737f861326d616d4338f06f" + [[package]] name = "heapless" version = "0.7.16" @@ -1832,6 +1835,19 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hgg" +version = "0.4.2-alpha.0" +source = "git+https://github.com/rust-cv/hgg#6d1eacde635158163fb663d9327a2d6f612dd435" +dependencies = [ + "ahash 0.7.6", + "hashbrown 0.11.2", + "header-vec", + "num-traits", + "serde", + "space", +] + [[package]] name = "hmac" version = "0.12.1" @@ -1841,22 +1857,6 @@ dependencies = [ "digest", ] -[[package]] -name = "hnsw" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b9740ebf8769ec4ad6762cc951ba18f39bba6dfbc2fbbe46285f7539af79752" -dependencies = [ - "ahash 0.7.6", - "hashbrown 0.11.2", - "libm", - "num-traits", - "rand_core", - "serde", - "smallvec", - "space", -] - [[package]] name = "http" version = "0.2.9" @@ -2729,7 +2729,7 @@ dependencies = [ "geoutils", "grenad", "heed", - "hnsw", + "hgg", "insta", "itertools", "json-depth-checker", @@ -2744,7 +2744,6 @@ dependencies = [ "once_cell", "ordered-float", "rand", - "rand_pcg", "rayon", "roaring", "rstar", @@ -3307,16 +3306,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rand_pcg" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e" -dependencies = [ - "rand_core", - "serde", -] - [[package]] name = "rayon" version = "1.7.0" @@ -3776,9 +3765,6 @@ name = "smallvec" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" -dependencies = [ - "serde", -] [[package]] name = "smartstring" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 08f0c2645..c17d100f5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -33,14 +33,13 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f "lmdb", "sync-read-txn", ] } -hnsw = { version = "0.11.0", features = ["serde1"] } +hgg = { git = "https://github.com/rust-cv/hgg", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.10" obkv = "0.2.0" once_cell = "1.17.1" ordered-float = "3.6.0" -rand_pcg = { version = "0.3.1", features = ["serde1"] } rayon = "1.7.0" roaring = "0.10.1" rstar = { version = "0.10.0", features = ["serde"] } diff --git a/milli/src/dot_product.rs b/milli/src/distance.rs similarity index 63% rename from milli/src/dot_product.rs rename to milli/src/distance.rs index 86dd2f1d4..c26a745a4 100644 --- a/milli/src/dot_product.rs +++ b/milli/src/distance.rs @@ -18,3 +18,17 @@ impl Metric> for DotProduct { dist.to_bits() } } + +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] +pub struct Euclidean; + +impl Metric> for Euclidean { + type Unit = u32; + + fn distance(&self, a: &Vec, b: &Vec) -> Self::Unit { + let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum(); + let dist = squared.sqrt(); + debug_assert!(!dist.is_nan()); + dist.to_bits() + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 4cdfb010c..e29c6da22 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -8,12 +8,11 @@ use charabia::{Language, Script}; use heed::flags::Flags; use heed::types::*; use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; -use rand_pcg::Pcg32; use roaring::RoaringBitmap; use rstar::RTree; use time::OffsetDateTime; -use crate::dot_product::DotProduct; +use crate::distance::Euclidean; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -28,8 +27,8 @@ use crate::{ Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, }; -/// The HNSW data-structure that we serialize, fill and search in. -pub type Hnsw = hnsw::Hnsw, Pcg32, 12, 24>; +/// The HGG data-structure that we serialize, fill and search in. +pub type Hgg = hgg::Hgg, DocumentId>; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9; @@ -47,7 +46,7 @@ pub mod main_key { pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_RTREE_KEY: &str = "geo-rtree"; - pub const VECTOR_HNSW_KEY: &str = "vector-hnsw"; + pub const VECTOR_HGG_KEY: &str = "vector-hgg"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -92,7 +91,6 @@ pub mod db_name { pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; - pub const VECTOR_ID_DOCID: &str = "vector-id-docids"; pub const DOCUMENTS: &str = "documents"; pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids"; } @@ -156,9 +154,6 @@ pub struct Index { /// Maps the document id, the facet field id and the strings. pub field_id_docid_facet_strings: Database, - /// Maps a vector id to the document id that have it. - pub vector_id_docid: Database, OwnedType>, - /// Maps the document id to the document as an obkv store. pub(crate) documents: Database, ObkvCodec>, } @@ -172,7 +167,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(24); + options.max_dbs(23); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -212,7 +207,6 @@ impl Index { env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?; - let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; wtxn.commit()?; @@ -241,7 +235,6 @@ impl Index { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_id_docid, documents, }) } @@ -513,22 +506,22 @@ impl Index { } } - /* vector HNSW */ + /* vector HGG */ - /// Writes the provided `hnsw`. - pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw) + /// Writes the provided `hgg`. + pub(crate) fn put_vector_hgg(&self, wtxn: &mut RwTxn, hgg: &Hgg) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode>(wtxn, main_key::VECTOR_HGG_KEY, hgg) } - /// Delete the `hnsw`. - pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY) + /// Delete the `hgg`. + pub(crate) fn delete_vector_hgg(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HGG_KEY) } - /// Returns the `hnsw`. - pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result> { - match self.main.get::<_, Str, SerdeBincode>(rtxn, main_key::VECTOR_HNSW_KEY)? { - Some(hnsw) => Ok(Some(hnsw)), + /// Returns the `hgg`. + pub fn vector_hgg(&self, rtxn: &RoTxn) -> Result> { + match self.main.get::<_, Str, SerdeBincode>(rtxn, main_key::VECTOR_HGG_KEY)? { + Some(hgg) => Ok(Some(hgg)), None => Ok(None), } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 2e62e35ac..4c7428fa8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -10,7 +10,7 @@ pub mod documents; mod asc_desc; mod criterion; -pub mod dot_product; +mod distance; mod error; mod external_documents_ids; pub mod facet; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 948a2fa21..f1aa21484 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,7 +28,6 @@ use db_cache::DatabaseCache; use exact_attribute::ExactAttribute; use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo}; use heed::RoTxn; -use hnsw::Searcher; use interner::{DedupInterner, Interner}; pub use logger::visual::VisualSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; @@ -40,7 +39,7 @@ use ranking_rules::{ use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use roaring::RoaringBitmap; use sort::Sort; -use space::Neighbor; +use space::{KnnMap, Neighbor}; use self::geo_sort::GeoSort; pub use self::geo_sort::Strategy as GeoSortStrategy; @@ -48,9 +47,7 @@ use self::graph_based_ranking_rule::Words; use self::interner::Interned; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; -use crate::{ - AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32, -}; +use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -450,26 +447,15 @@ pub fn execute_search( let docids = match vector { Some(vector) => { // return the nearest documents that are also part of the candidates. - let mut searcher = Searcher::new(); - let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default(); - let ef = hnsw.len().min(100); - let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef]; - let neighbors = hnsw.nearest(&vector, ef, &mut searcher, &mut dest[..]); - - let mut docids = Vec::new(); - for Neighbor { index, distance } in neighbors.iter() { - let index = BEU32::new(*index as u32); - let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get(); - dbg!(distance, f32::from_bits(*distance)); - if universe.contains(docid) { - docids.push(docid); - if docids.len() == length { - break; - } - } - } - - docids + let hgg = ctx.index.vector_hgg(ctx.txn)?.unwrap_or_default(); + hgg.knn_values(&vector, 100) + .into_iter() + .filter(|(Neighbor { distance, .. }, docid)| { + dbg!(distance, f32::from_bits(*distance)); + universe.contains(**docid) + }) + .map(|(_, docid)| *docid) + .collect() } // return the search docids if the vector field is not specified None => docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f4a2d43fe..e5e7f5491 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -39,7 +39,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_id_docid, documents, } = self.index; @@ -58,7 +57,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; - self.index.delete_vector_hnsw(self.wtxn)?; + self.index.delete_vector_hgg(self.wtxn)?; // We clean all the faceted documents ids. for field_id in faceted_fields { @@ -97,7 +96,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; - vector_id_docid.clear(self.wtxn)?; documents.clear(self.wtxn)?; Ok(number_of_documents) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 73af66a95..890c2b329 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -240,7 +240,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_exists_docids, facet_id_is_null_docids, facet_id_is_empty_docids, - vector_id_docid, documents, } = self.index; // Remove from the documents database @@ -275,6 +274,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &mut words_to_delete, )?; + todo!("delete the documents from the Hgg datastructure"); + // We construct an FST set that contains the words to delete from the words FST. let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e2c67044c..82c02375c 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -9,8 +9,8 @@ use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::RwTxn; -use hnsw::Searcher; use roaring::RoaringBitmap; +use space::KnnInsert; use super::helpers::{ self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, @@ -19,7 +19,7 @@ use super::{ClonableMmap, MergeFn}; use crate::facet::FacetType; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; -use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; +use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result}; pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), @@ -225,19 +225,16 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } TypedChunk::VectorPoints(vector_points) => { - let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default(); - let mut searcher = Searcher::new(); - + let mut hgg = index.vector_hgg(wtxn)?.unwrap_or_default(); let mut cursor = vector_points.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - // convert the vector back to a Vec - let vector: Vec = pod_collect_to_vec(value); - let vector_id = hnsw.insert(vector, &mut searcher) as u32; - index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?; + // convert the vector back to a Vec and insert it. + // TODO enable again when the library is fixed + hgg.insert(pod_collect_to_vec(value), docid); } - index.put_vector_hnsw(wtxn, &hnsw)?; + index.put_vector_hgg(wtxn, &hgg)?; } TypedChunk::ScriptLanguageDocids(hash_pair) => { let mut buffer = Vec::new();