Store the vectors in an HNSW in LMDB

This commit is contained in:
Clément Renault 2023-06-08 12:19:06 +02:00
parent 7ac2f1489d
commit 4571e512d2
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
8 changed files with 142 additions and 36 deletions

View file

@ -39,6 +39,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
} = self.index;
@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?;
// We clean all the faceted documents ids.
for field_id in faceted_fields {
@ -95,6 +97,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
vector_id_docid.clear(self.wtxn)?;
documents.clear(self.wtxn)?;
Ok(number_of_documents)

View file

@ -240,6 +240,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
vector_id_docid,
documents,
} = self.index;
// Remove from the documents database

View file

@ -4,10 +4,12 @@ use std::convert::TryInto;
use std::fs::File;
use std::io;
use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::RwTxn;
use hnsw::Searcher;
use roaring::RoaringBitmap;
use super::helpers::{
@ -17,7 +19,7 @@ use super::{ClonableMmap, MergeFn};
use crate::facet::FacetType;
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@ -223,27 +225,19 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints(vector_points) => {
// let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
// let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
// let mut cursor = geo_points.into_cursor()?;
// while let Some((key, value)) = cursor.move_on_next()? {
// // convert the key back to a u32 (4 bytes)
// let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// // convert the latitude and longitude back to a f64 (8 bytes)
// let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
// let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
// let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
// let xyz_point = lat_lng_to_xyz(&point);
// rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
// geo_faceted_docids.insert(docid);
// }
// index.put_geo_rtree(wtxn, &rtree)?;
// index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
todo!("index vector points")
let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// convert the vector back to a Vec<f32>
let vector: Vec<f32> = pod_collect_to_vec(value);
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
}
index.put_vector_hnsw(wtxn, &hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();