From ea2f2ecf96dbfd03e419a7207e8c17202eeac03d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 17:49:50 +0200 Subject: [PATCH] create a new database containing all the documents that were geo-faceted --- milli/src/index.rs | 38 ++++++++++++++++++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 29 +++++++++++++- .../src/update/index_documents/typed_chunk.rs | 4 ++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 70aefa9be..f2ddba699 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -32,7 +32,8 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; - pub const GEO_RTREE_KEY: &str = "geo"; + pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; + pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -320,6 +321,41 @@ impl Index { } } + /* geo faceted */ + + /// Writes the documents ids that are faceted with a _geo field + pub(crate) fn put_geo_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Delete the documents ids that are faceted with a _geo field + pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + &RoaringBitmap::new(), + ) + } + + /// Retrieve all the documents ids that are faceted with a _geo field + pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> 
heed::Result<RoaringBitmap> { + match self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)? + { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* field distribution */ /// Writes the field distribution which associates every field name with diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ef91991e8..e937cb65f 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -49,6 +49,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; + self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // We clean all the faceted documents ids. let empty = RoaringBitmap::default(); @@ -116,6 +117,7 @@ mod tests { assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.geo_rtree(&rtxn).unwrap().is_none()); + assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 84fc3215f..cfd777d11 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -381,6 +381,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? 
{ + let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; + let points_to_remove: Vec<_> = rtree .iter() .filter(|&point| self.documents_ids.contains(point.data)) @@ -388,9 +390,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { .collect(); points_to_remove.iter().for_each(|point| { rtree.remove(&point); + geo_faceted_doc_ids.remove(point.data); }); self.index.put_geo_rtree(self.wtxn, &rtree)?; + self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } // We delete the documents ids that are under the facet field id values. @@ -555,6 +559,8 @@ where #[cfg(test)] mod tests { + use std::collections::HashSet; + use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -726,11 +732,30 @@ mod tests { let rtxn = index.read_txn().unwrap(); let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); let all_geo_ids = rtree.iter().map(|point| point.data).collect::<Vec<_>>(); - let all_geo_documents = index.documents(&rtxn, all_geo_ids.iter().copied()).unwrap(); + let all_geo_documents = index + .documents(&rtxn, all_geo_ids.iter().copied()) + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect::<HashSet<_>>(); - for (id, _) in all_geo_documents.iter() { + let all_geo_faceted_ids = geo_faceted_doc_ids.iter().collect::<Vec<_>>(); + let all_geo_faceted_documents = index + .documents(&rtxn, all_geo_faceted_ids.iter().copied()) + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect::<HashSet<_>>(); + + assert_eq!( + all_geo_documents, all_geo_faceted_documents, + "There is an inconsistency between the geo_faceted database and the rtree" + ); + + for id in all_geo_documents.iter() { assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 9605fea7d..b09bee213 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ 
b/milli/src/update/index_documents/typed_chunk.rs @@ -182,6 +182,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::GeoPoints(mut geo_points) => { // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); + let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?; + while let Some((key, value)) = geo_points.next()? { // convert the key back to a u32 (4 bytes) let (key, _) = helpers::try_split_array_at::<u8, 4>(key).unwrap(); @@ -192,8 +194,10 @@ pub(crate) fn write_typed_chunk_into_index( let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); + doc_ids.insert(key); } index.put_geo_rtree(wtxn, &rtree)?; + index.put_geo_faceted_documents_ids(wtxn, &doc_ids)?; } }