diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index b03302ca1..0ed80dd92 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -139,7 +139,7 @@ mod test { let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 100); - let tempdir = tempfile::TempDir::new_in("databases/").unwrap(); + let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index c6b83eeb6..933f68837 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String { }); snap } +pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |( + (field_id, doc_id, string), + other_string, + )| { + &format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}") + }); + snap +} pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let documents_ids = index.documents_ids(&rtxn).unwrap(); @@ -423,6 +432,9 @@ macro_rules! full_snap_of_db { ($index:ident, facet_id_string_docids) => {{ $crate::snapshot_tests::snap_facet_id_string_docids(&$index) }}; + ($index:ident, field_id_docid_facet_strings) => {{ + $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index df0b93839..a0d426d7a 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,8 +1,9 @@ +use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; -use crate::Result; +use crate::{Index, Result}; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -287,7 +288,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count(); - if size_highest_level < self.min_level_size { + if size_highest_level < self.group_size * self.min_level_size { return Ok(()); } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 0bb83c29a..fe42801e7 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -38,12 +38,13 @@ pub fn extract_facet_string_docids( let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); let normalised_value = std::str::from_utf8(normalized_value_bytes)?; let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; + facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1ab1bd38d..2a2511362 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; use crate::update::DeleteDocuments; - use crate::BEU16; + use crate::{db_snap, BEU16}; #[test] fn simple_document_replacement() { @@ -1379,6 +1379,25 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let hidden = index.faceted_fields(&rtxn).unwrap(); @@ -1399,6 +1418,15 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, field_id_docid_facet_strings, @""); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); @@ -1412,6 +1440,25 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7aa306183..df98724da 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; @@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::update::FacetsUpdateIncremental; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, + Index, Result, }; pub(crate) enum TypedChunk { @@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { - append_entries_into_database( - facet_id_f64_docids_iter, - &index.facet_id_f64_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; + // merge cbo roaring bitmaps is not the correct merger because the data in the DB + // is FacetGroupValue and not RoaringBitmap + // so I need to create my own merging function + + // facet_id_string_docids is encoded as: + // key: FacetKeyCodec + // value: CboRoaringBitmapCodec + // basically + + // TODO: a condition saying "if I have more than 1/50th of the DB to add, + // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, + // it is a ratio I determine empirically + + // for now I only do it incrementally, to see if things work + let indexer = FacetsUpdateIncremental::new( + index.facet_id_f64_docids.remap_key_type::>(), + ); + + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = facet_id_f64_docids_iter.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let docids = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + for (field_id, new_docids) in new_faceted_docids { + let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?; + docids |= new_docids; + index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?; + } + is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { @@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index( // it is a ratio I determine empirically // for now I only do it incrementally, to see if things work - let builder = FacetsUpdateIncremental::new( + let indexer = FacetsUpdateIncremental::new( index.facet_id_string_docids.remap_key_type::>(), ); + let mut new_faceted_docids = HashMap::::default(); + let mut cursor = facet_id_string_docids.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { let key = FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let value = + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - builder.insert(wtxn, key.field_id, key.left_bound, &value)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + for (field_id, new_docids) in new_faceted_docids { + let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?; + docids |= new_docids; + index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?; } is_merged_database = true; }