create a new database containing all the documents that were geo-faceted

This commit is contained in:
Irevoire 2021-08-26 17:49:50 +02:00 committed by Tamo
parent 4b459768a0
commit ea2f2ecf96
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
4 changed files with 70 additions and 3 deletions

View File

@ -32,7 +32,8 @@ pub mod main_key {
pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const GEO_RTREE_KEY: &str = "geo"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
pub const GEO_RTREE_KEY: &str = "geo-rtree";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const PRIMARY_KEY_KEY: &str = "primary-key";
@ -320,6 +321,41 @@ impl Index {
} }
} }
/* geo faceted */
/// Stores the ids of the documents that are faceted with a `_geo` field.
///
/// The bitmap is written to the `main` database under
/// `main_key::GEO_FACETED_DOCUMENTS_IDS_KEY`.
pub(crate) fn put_geo_faceted_documents_ids(
    &self,
    wtxn: &mut RwTxn,
    docids: &RoaringBitmap,
) -> heed::Result<()> {
    let key = main_key::GEO_FACETED_DOCUMENTS_IDS_KEY;
    self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, key, docids)
}
/// Deletes the ids of the documents that are faceted with a `_geo` field.
///
/// The entry stored under `main_key::GEO_FACETED_DOCUMENTS_IDS_KEY` is removed
/// from the `main` database rather than being overwritten with an empty bitmap.
/// `geo_faceted_documents_ids` maps a missing entry to an empty bitmap, so
/// readers observe the same result either way.
pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<()> {
    // `delete` returns a value describing the removal (presumably whether the key
    // existed); it is discarded here to keep the `heed::Result<()>` signature.
    self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)?;
    Ok(())
}
/// Retrieves the ids of all the documents that are faceted with a `_geo` field.
///
/// Returns an empty bitmap when no entry is stored under
/// `main_key::GEO_FACETED_DOCUMENTS_IDS_KEY`.
pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
    let stored = self
        .main
        .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)?;
    // A missing entry is treated as "no geo-faceted documents".
    Ok(stored.unwrap_or_default())
}
/* field distribution */ /* field distribution */
/// Writes the field distribution which associates every field name with /// Writes the field distribution which associates every field name with

View File

@ -49,6 +49,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
// We clean all the faceted documents ids. // We clean all the faceted documents ids.
let empty = RoaringBitmap::default(); let empty = RoaringBitmap::default();
@ -116,6 +117,7 @@ mod tests {
assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.documents_ids(&rtxn).unwrap().is_empty());
assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty());
assert!(index.geo_rtree(&rtxn).unwrap().is_none()); assert!(index.geo_rtree(&rtxn).unwrap().is_none());
assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty());
assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());

View File

@ -381,6 +381,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
let points_to_remove: Vec<_> = rtree let points_to_remove: Vec<_> = rtree
.iter() .iter()
.filter(|&point| self.documents_ids.contains(point.data)) .filter(|&point| self.documents_ids.contains(point.data))
@ -388,9 +390,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
.collect(); .collect();
points_to_remove.iter().for_each(|point| { points_to_remove.iter().for_each(|point| {
rtree.remove(&point); rtree.remove(&point);
geo_faceted_doc_ids.remove(point.data);
}); });
self.index.put_geo_rtree(self.wtxn, &rtree)?; self.index.put_geo_rtree(self.wtxn, &rtree)?;
self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?;
} }
// We delete the documents ids that are under the facet field id values. // We delete the documents ids that are under the facet field id values.
@ -555,6 +559,8 @@ where
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::HashSet;
use big_s::S; use big_s::S;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::hashset; use maplit::hashset;
@ -726,11 +732,30 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); let rtree = index.geo_rtree(&rtxn).unwrap().unwrap();
let geo_faceted_doc_ids = index.geo_faceted_documents_ids(&rtxn).unwrap();
let all_geo_ids = rtree.iter().map(|point| point.data).collect::<Vec<_>>(); let all_geo_ids = rtree.iter().map(|point| point.data).collect::<Vec<_>>();
let all_geo_documents = index.documents(&rtxn, all_geo_ids.iter().copied()).unwrap(); let all_geo_documents = index
.documents(&rtxn, all_geo_ids.iter().copied())
.unwrap()
.iter()
.map(|(id, _)| *id)
.collect::<HashSet<_>>();
for (id, _) in all_geo_documents.iter() { let all_geo_faceted_ids = geo_faceted_doc_ids.iter().collect::<Vec<_>>();
let all_geo_faceted_documents = index
.documents(&rtxn, all_geo_faceted_ids.iter().copied())
.unwrap()
.iter()
.map(|(id, _)| *id)
.collect::<HashSet<_>>();
assert_eq!(
all_geo_documents, all_geo_faceted_documents,
"There is an inconsistency between the geo_faceted database and the rtree"
);
for id in all_geo_documents.iter() {
assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id);
} }

View File

@ -182,6 +182,8 @@ pub(crate) fn write_typed_chunk_into_index(
TypedChunk::GeoPoints(mut geo_points) => { TypedChunk::GeoPoints(mut geo_points) => {
// TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?;
while let Some((key, value)) = geo_points.next()? { while let Some((key, value)) = geo_points.next()? {
// convert the key back to a u32 (4 bytes) // convert the key back to a u32 (4 bytes)
let (key, _) = helpers::try_split_array_at::<u8, 4>(key).unwrap(); let (key, _) = helpers::try_split_array_at::<u8, 4>(key).unwrap();
@ -192,8 +194,10 @@ pub(crate) fn write_typed_chunk_into_index(
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
rtree.insert(GeoPoint::new(point, key)); rtree.insert(GeoPoint::new(point, key));
doc_ids.insert(key);
} }
index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_rtree(wtxn, &rtree)?;
index.put_geo_faceted_documents_ids(wtxn, &doc_ids)?;
} }
} }