Fix some facet indexing bugs

This commit is contained in:
Loïc Lecrenier 2022-09-01 09:51:43 +02:00 committed by Loïc Lecrenier
parent 68cbcdf08b
commit 61252248fb
6 changed files with 115 additions and 18 deletions

View File

@ -139,7 +139,7 @@ mod test {
let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize);
let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 100);
let tempdir = tempfile::TempDir::new_in("databases/").unwrap();
let tempdir = tempfile::TempDir::new().unwrap();
let env = options.open(tempdir.path()).unwrap();
let content = env.create_database(None).unwrap();

View File

@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String {
});
snap
}
pub fn snap_field_id_docid_facet_strings(index: &Index) -> String {
let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |(
(field_id, doc_id, string),
other_string,
)| {
&format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}")
});
snap
}
pub fn snap_documents_ids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let documents_ids = index.documents_ids(&rtxn).unwrap();
@ -423,6 +432,9 @@ macro_rules! full_snap_of_db {
($index:ident, facet_id_string_docids) => {{
$crate::snapshot_tests::snap_facet_id_string_docids(&$index)
}};
($index:ident, field_id_docid_facet_strings) => {{
$crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index)
}};
($index:ident, documents_ids) => {{
$crate::snapshot_tests::snap_documents_ids(&$index)
}};

View File

@ -1,8 +1,9 @@
use crate::facet::FacetType;
use crate::heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
};
use crate::search::facet::get_highest_level;
use crate::Result;
use crate::{Index, Result};
use heed::Error;
use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn};
use roaring::RoaringBitmap;
@ -287,7 +288,7 @@ impl FacetsUpdateIncremental {
.prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)?
.count();
if size_highest_level < self.min_level_size {
if size_highest_level < self.group_size * self.min_level_size {
return Ok(());
}

View File

@ -38,12 +38,13 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let (document_id_bytes, normalized_value_bytes) =
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
let key = FacetKey { field_id, level: 0, left_bound: normalised_value };
let key_bytes = FacetKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?;
facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?;
}
sorter_into_reader(facet_string_docids_sorter, indexer)

View File

@ -592,7 +592,7 @@ mod tests {
use crate::index::tests::TempIndex;
use crate::search::TermsMatchingStrategy;
use crate::update::DeleteDocuments;
use crate::BEU16;
use crate::{db_snap, BEU16};
#[test]
fn simple_document_replacement() {
@ -1379,6 +1379,25 @@ mod tests {
})
.unwrap();
db_snap!(index, facet_id_string_docids, @r###"
3 0 first 1 [1, ]
3 0 second 1 [2, ]
3 0 third 1 [3, ]
3 0 zeroth 1 [0, ]
"###);
db_snap!(index, field_id_docid_facet_strings, @r###"
3 0 zeroth zeroth
3 1 first first
3 2 second second
3 3 third third
"###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap();
let hidden = index.faceted_fields(&rtxn).unwrap();
@ -1399,6 +1418,15 @@ mod tests {
})
.unwrap();
db_snap!(index, facet_id_string_docids, @"");
db_snap!(index, field_id_docid_facet_strings, @"");
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap();
let facets = index.faceted_fields(&rtxn).unwrap();
@ -1412,6 +1440,25 @@ mod tests {
})
.unwrap();
db_snap!(index, facet_id_string_docids, @r###"
3 0 first 1 [1, ]
3 0 second 1 [2, ]
3 0 third 1 [3, ]
3 0 zeroth 1 [0, ]
"###);
db_snap!(index, field_id_docid_facet_strings, @r###"
3 0 zeroth zeroth
3 1 first first
3 2 second second
3 3 third third
"###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap();
let facets = index.faceted_fields(&rtxn).unwrap();

View File

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::File;
use std::io;
@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::update::FacetsUpdateIncremental;
use crate::{
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
Result,
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint,
Index, Result,
};
pub(crate) enum TypedChunk {
@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index(
is_merged_database = true;
}
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
append_entries_into_database(
facet_id_f64_docids_iter,
&index.facet_id_f64_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
// merge cbo roaring bitmaps is not the correct merger because the data in the DB
// is FacetGroupValue and not RoaringBitmap
// so I need to create my own merging function
// facet_id_string_docids is encoded as:
// key: FacetKeyCodec<StrRefCodec>
// value: CboRoaringBitmapCodec
// basically
// TODO: a condition saying "if I have more than 1/50th of the DB to add,
// then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
// it is a ratio I determine empirically
// for now I only do it incrementally, to see if things work
let indexer = FacetsUpdateIncremental::new(
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
);
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
let mut cursor = facet_id_f64_docids_iter.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let key =
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
let docids =
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
}
for (field_id, new_docids) in new_faceted_docids {
let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?;
docids |= new_docids;
index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?;
}
is_merged_database = true;
}
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index(
// it is a ratio I determine empirically
// for now I only do it incrementally, to see if things work
let builder = FacetsUpdateIncremental::new(
let indexer = FacetsUpdateIncremental::new(
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
);
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
let mut cursor = facet_id_string_docids.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let key =
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
let value =
let docids =
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
builder.insert(wtxn, key.field_id, key.left_bound, &value)?;
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
}
for (field_id, new_docids) in new_faceted_docids {
let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?;
docids |= new_docids;
index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?;
}
is_merged_database = true;
}