Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-07-03 11:57:07 +02:00
Move to the hgg crate
Commit 268a9ef416 (parent 642b0f3a1b)
9 changed files with 73 additions and 99 deletions
```diff
@@ -18,3 +18,17 @@ impl Metric<Vec<f32>> for DotProduct {
         dist.to_bits()
     }
 }
+
+#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
+pub struct Euclidean;
+
+impl Metric<Vec<f32>> for Euclidean {
+    type Unit = u32;
+
+    fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
+        let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
+        let dist = squared.sqrt();
+        debug_assert!(!dist.is_nan());
+        dist.to_bits()
+    }
+}
```
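The `Euclidean` metric added above returns the distance as its raw bit pattern, which only orders correctly because the value is a non-negative, non-NaN `f32`. A standalone sketch of that property (plain functions, not the `space` crate's `Metric` trait):

```rust
// Standalone sketch (plain functions, not the `space::Metric` trait): returning
// `f32::to_bits` as a `u32` preserves ordering because the distance is always a
// non-negative, non-NaN float, and for such values the IEEE-754 bit patterns
// compare in the same order as the numbers themselves.
fn euclidean_bits(a: &[f32], b: &[f32]) -> u32 {
    let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
    let dist = squared.sqrt();
    debug_assert!(!dist.is_nan());
    dist.to_bits()
}

fn main() {
    let far = euclidean_bits(&[0.0, 0.0], &[3.0, 4.0]); // distance 5.0
    let near = euclidean_bits(&[0.0, 0.0], &[1.0, 0.0]); // distance 1.0
    assert!(near < far);
    assert_eq!(f32::from_bits(far), 5.0);
}
```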
```diff
@@ -8,12 +8,11 @@ use charabia::{Language, Script};
 use heed::flags::Flags;
 use heed::types::*;
 use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
-use rand_pcg::Pcg32;
 use roaring::RoaringBitmap;
 use rstar::RTree;
 use time::OffsetDateTime;
 
-use crate::dot_product::DotProduct;
+use crate::distance::Euclidean;
 use crate::error::{InternalError, UserError};
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
```
```diff
@@ -28,8 +27,8 @@ use crate::{
     Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
 };
 
-/// The HNSW data-structure that we serialize, fill and search in.
-pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
+/// The HGG data-structure that we serialize, fill and search in.
+pub type Hgg = hgg::Hgg<Euclidean, Vec<f32>, DocumentId>;
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
 pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
```
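The new `Hgg` type carries the `DocumentId` as the value attached to each point, which is what lets the later hunks drop the separate `vector_id_docid` table. A brute-force stand-in (not the `hgg` crate) with the same insert/query shape:

```rust
// Not the `hgg` crate: a brute-force stand-in with the same (metric, point, value)
// shape, showing why carrying the document id as the stored value makes the old
// vector_id -> docid lookup table unnecessary.
struct BruteForceKnn {
    entries: Vec<(Vec<f32>, u32)>, // (embedding, document id)
}

impl BruteForceKnn {
    fn new() -> Self {
        BruteForceKnn { entries: Vec::new() }
    }

    fn insert(&mut self, point: Vec<f32>, docid: u32) {
        self.entries.push((point, docid));
    }

    /// Returns up to `num` (distance_bits, docid) pairs, nearest first.
    fn knn_values(&self, query: &[f32], num: usize) -> Vec<(u32, u32)> {
        let mut hits: Vec<(u32, u32)> = self
            .entries
            .iter()
            .map(|(point, docid)| {
                let squared: f32 =
                    point.iter().zip(query).map(|(a, b)| (a - b).powi(2)).sum();
                (squared.sqrt().to_bits(), *docid)
            })
            .collect();
        hits.sort_unstable_by_key(|(dist, _)| *dist);
        hits.truncate(num);
        hits
    }
}

fn main() {
    let mut knn = BruteForceKnn::new();
    knn.insert(vec![0.0, 1.0], 7);
    knn.insert(vec![1.0, 0.0], 9);
    // The document id comes back with the neighbor itself, no extra lookup.
    assert_eq!(knn.knn_values(&[0.0, 0.9], 1)[0].1, 7);
}
```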
```diff
@@ -47,7 +46,7 @@ pub mod main_key {
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
     pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
     pub const GEO_RTREE_KEY: &str = "geo-rtree";
-    pub const VECTOR_HNSW_KEY: &str = "vector-hnsw";
+    pub const VECTOR_HGG_KEY: &str = "vector-hgg";
     pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
     pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
```
```diff
@@ -92,7 +91,6 @@ pub mod db_name {
     pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
-    pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
     pub const DOCUMENTS: &str = "documents";
     pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
 }
```
```diff
@@ -156,9 +154,6 @@ pub struct Index {
     /// Maps the document id, the facet field id and the strings.
     pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
 
-    /// Maps a vector id to the document id that have it.
-    pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
-
     /// Maps the document id to the document as an obkv store.
     pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
 }
```
```diff
@@ -172,7 +167,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(24);
+        options.max_dbs(23);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
```
```diff
@@ -212,7 +207,6 @@ impl Index {
             env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
         let field_id_docid_facet_strings =
             env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
-        let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
         let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
         wtxn.commit()?;
 
```
```diff
@@ -241,7 +235,6 @@ impl Index {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_id_docid,
             documents,
         })
     }
```
```diff
@@ -513,22 +506,22 @@ impl Index {
         }
     }
 
-    /* vector HNSW */
+    /* vector HGG */
 
-    /// Writes the provided `hnsw`.
-    pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
-        self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw)
+    /// Writes the provided `hgg`.
+    pub(crate) fn put_vector_hgg(&self, wtxn: &mut RwTxn, hgg: &Hgg) -> heed::Result<()> {
+        self.main.put::<_, Str, SerdeBincode<Hgg>>(wtxn, main_key::VECTOR_HGG_KEY, hgg)
     }
 
-    /// Delete the `hnsw`.
-    pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-        self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY)
+    /// Delete the `hgg`.
+    pub(crate) fn delete_vector_hgg(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HGG_KEY)
     }
 
-    /// Returns the `hnsw`.
-    pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
-        match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? {
-            Some(hnsw) => Ok(Some(hnsw)),
+    /// Returns the `hgg`.
+    pub fn vector_hgg(&self, rtxn: &RoTxn) -> Result<Option<Hgg>> {
+        match self.main.get::<_, Str, SerdeBincode<Hgg>>(rtxn, main_key::VECTOR_HGG_KEY)? {
+            Some(hgg) => Ok(Some(hgg)),
             None => Ok(None),
         }
     }
```
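These accessors store the entire graph as a single bincode-encoded value under `main_key::VECTOR_HGG_KEY` in the main database. A sketch of the round-trip that the `SerdeBincode` codec performs, using `bincode` 1.x and `serde` (derive feature) directly, with a hypothetical `TinyGraph` standing in for the real `Hgg` type:

```rust
use serde::{Deserialize, Serialize};

// Hypothetical stand-in for the graph; the real value is the `Hgg` type above.
#[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
struct TinyGraph {
    points: Vec<Vec<f32>>,
    docids: Vec<u32>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let graph = TinyGraph { points: vec![vec![1.0, 2.0]], docids: vec![42] };

    // put_vector_hgg: the whole graph becomes one serialized value stored
    // under the "vector-hgg" entry of the main database.
    let bytes = bincode::serialize(&graph)?;

    // vector_hgg: reading that entry deserializes the whole graph back.
    let roundtrip: TinyGraph = bincode::deserialize(&bytes)?;
    assert_eq!(roundtrip, graph);
    Ok(())
}
```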
```diff
@@ -10,7 +10,7 @@ pub mod documents;
 
 mod asc_desc;
 mod criterion;
-pub mod dot_product;
+mod distance;
 mod error;
 mod external_documents_ids;
 pub mod facet;
```
```diff
@@ -28,7 +28,6 @@ use db_cache::DatabaseCache;
 use exact_attribute::ExactAttribute;
 use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
 use heed::RoTxn;
-use hnsw::Searcher;
 use interner::{DedupInterner, Interner};
 pub use logger::visual::VisualSearchLogger;
 pub use logger::{DefaultSearchLogger, SearchLogger};
```
```diff
@@ -40,7 +39,7 @@ use ranking_rules::{
 use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
 use roaring::RoaringBitmap;
 use sort::Sort;
-use space::Neighbor;
+use space::{KnnMap, Neighbor};
 
 use self::geo_sort::GeoSort;
 pub use self::geo_sort::Strategy as GeoSortStrategy;
```
```diff
@@ -48,9 +47,7 @@ use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::search::new::distinct::apply_distinct_rule;
-use crate::{
-    AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
-};
+use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
 
 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
```
```diff
@@ -450,26 +447,15 @@ pub fn execute_search(
     let docids = match vector {
         Some(vector) => {
             // return the nearest documents that are also part of the candidates.
-            let mut searcher = Searcher::new();
-            let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
-            let ef = hnsw.len().min(100);
-            let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
-            let neighbors = hnsw.nearest(&vector, ef, &mut searcher, &mut dest[..]);
-
-            let mut docids = Vec::new();
-            for Neighbor { index, distance } in neighbors.iter() {
-                let index = BEU32::new(*index as u32);
-                let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
-                dbg!(distance, f32::from_bits(*distance));
-                if universe.contains(docid) {
-                    docids.push(docid);
-                    if docids.len() == length {
-                        break;
-                    }
-                }
-            }
-
-            docids
+            let hgg = ctx.index.vector_hgg(ctx.txn)?.unwrap_or_default();
+            hgg.knn_values(&vector, 100)
+                .into_iter()
+                .filter(|(Neighbor { distance, .. }, docid)| {
+                    dbg!(distance, f32::from_bits(*distance));
+                    universe.contains(**docid)
+                })
+                .map(|(_, docid)| *docid)
+                .collect()
         }
         // return the search docids if the vector field is not specified
         None => docids,
```
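The new search path asks the graph for the 100 nearest `(Neighbor, DocumentId)` pairs and keeps only those inside the candidate `universe`; unlike the old HNSW loop, the code shown here no longer stops once `length` results have been collected. A standalone sketch of that filtering with plain types (a `HashSet` stands in for the `RoaringBitmap` universe), including a `take(length)` to mirror the old cap:

```rust
use std::collections::HashSet;

// Standalone sketch of the candidate filtering above, with plain tuples standing in
// for the hgg neighbors and a HashSet for the RoaringBitmap universe.
fn nearest_in_universe(
    neighbors: Vec<(u32 /* distance bits */, u32 /* docid */)>, // sorted nearest-first
    universe: &HashSet<u32>,
    length: usize,
) -> Vec<u32> {
    neighbors
        .into_iter()
        .filter(|(_, docid)| universe.contains(docid))
        .map(|(_, docid)| docid)
        .take(length) // the old HNSW loop enforced this cap explicitly
        .collect()
}

fn main() {
    let universe: HashSet<u32> = [2, 3, 5].into_iter().collect();
    let neighbors = vec![(10, 1), (20, 3), (30, 5), (40, 2)];
    assert_eq!(nearest_in_universe(neighbors, &universe, 2), vec![3, 5]);
}
```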
```diff
@@ -39,7 +39,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_id_docid,
             documents,
         } = self.index;
 
```
```diff
@@ -58,7 +57,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
         self.index.delete_geo_rtree(self.wtxn)?;
         self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
-        self.index.delete_vector_hnsw(self.wtxn)?;
+        self.index.delete_vector_hgg(self.wtxn)?;
 
         // We clean all the faceted documents ids.
         for field_id in faceted_fields {
```
```diff
@@ -97,7 +96,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         facet_id_string_docids.clear(self.wtxn)?;
         field_id_docid_facet_f64s.clear(self.wtxn)?;
         field_id_docid_facet_strings.clear(self.wtxn)?;
-        vector_id_docid.clear(self.wtxn)?;
         documents.clear(self.wtxn)?;
 
         Ok(number_of_documents)
```
```diff
@@ -240,7 +240,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             facet_id_exists_docids,
             facet_id_is_null_docids,
             facet_id_is_empty_docids,
-            vector_id_docid,
             documents,
         } = self.index;
         // Remove from the documents database
```
```diff
@@ -275,6 +274,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             &mut words_to_delete,
         )?;
 
+        todo!("delete the documents from the Hgg datastructure");
+
         // We construct an FST set that contains the words to delete from the words FST.
         let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
 
```
```diff
@@ -9,8 +9,8 @@ use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::RwTxn;
-use hnsw::Searcher;
 use roaring::RoaringBitmap;
+use space::KnnInsert;
 
 use super::helpers::{
     self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
```
```diff
@@ -19,7 +19,7 @@ use super::{ClonableMmap, MergeFn};
 use crate::facet::FacetType;
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
+use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
 
 pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
```
```diff
@@ -225,19 +225,16 @@ pub(crate) fn write_typed_chunk_into_index(
             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
         }
         TypedChunk::VectorPoints(vector_points) => {
-            let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
-            let mut searcher = Searcher::new();
-
+            let mut hgg = index.vector_hgg(wtxn)?.unwrap_or_default();
             let mut cursor = vector_points.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
                 // convert the key back to a u32 (4 bytes)
                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
-                // convert the vector back to a Vec<f32>
-                let vector: Vec<f32> = pod_collect_to_vec(value);
-                let vector_id = hnsw.insert(vector, &mut searcher) as u32;
-                index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
+                // convert the vector back to a Vec<f32> and insert it.
+                // TODO enable again when the library is fixed
+                hgg.insert(pod_collect_to_vec(value), docid);
             }
-            index.put_vector_hnsw(wtxn, &hnsw)?;
+            index.put_vector_hgg(wtxn, &hgg)?;
         }
         TypedChunk::ScriptLanguageDocids(hash_pair) => {
             let mut buffer = Vec::new();
```
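Each chunk entry decoded in the loop above is a 4-byte big-endian document id keyed to the embedding's raw `f32` bytes. A standalone sketch of that decoding, assuming `bytemuck` with its `extern_crate_alloc` feature for `pod_collect_to_vec`:

```rust
// Standalone sketch of the per-entry decoding done in the loop above, assuming
// `bytemuck` with the `extern_crate_alloc` feature for `pod_collect_to_vec`.
fn decode_entry(key: &[u8], value: &[u8]) -> (u32, Vec<f32>) {
    // The key is the document id encoded as 4 big-endian bytes.
    let docid = u32::from_be_bytes(key.try_into().expect("4-byte key"));
    // The value is the embedding stored as raw, native-endian f32 words.
    let vector: Vec<f32> = bytemuck::pod_collect_to_vec(value);
    (docid, vector)
}

fn main() {
    let key = 42u32.to_be_bytes();
    let value: Vec<u8> = [1.0f32, 2.0, 3.0].iter().flat_map(|f| f.to_ne_bytes()).collect();
    assert_eq!(decode_entry(&key, &value), (42, vec![1.0, 2.0, 3.0]));
}
```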