mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
Fix some facet indexing bugs
This commit is contained in:
parent
68cbcdf08b
commit
61252248fb
@ -139,7 +139,7 @@ mod test {
|
||||
let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize);
|
||||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 100);
|
||||
let tempdir = tempfile::TempDir::new_in("databases/").unwrap();
|
||||
let tempdir = tempfile::TempDir::new().unwrap();
|
||||
let env = options.open(tempdir.path()).unwrap();
|
||||
let content = env.create_database(None).unwrap();
|
||||
|
||||
|
@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String {
|
||||
});
|
||||
snap
|
||||
}
|
||||
pub fn snap_field_id_docid_facet_strings(index: &Index) -> String {
|
||||
let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |(
|
||||
(field_id, doc_id, string),
|
||||
other_string,
|
||||
)| {
|
||||
&format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}")
|
||||
});
|
||||
snap
|
||||
}
|
||||
pub fn snap_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let documents_ids = index.documents_ids(&rtxn).unwrap();
|
||||
@ -423,6 +432,9 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, facet_id_string_docids) => {{
|
||||
$crate::snapshot_tests::snap_facet_id_string_docids(&$index)
|
||||
}};
|
||||
($index:ident, field_id_docid_facet_strings) => {{
|
||||
$crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index)
|
||||
}};
|
||||
($index:ident, documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_documents_ids(&$index)
|
||||
}};
|
||||
|
@ -1,8 +1,9 @@
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::new::{
|
||||
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
||||
};
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::Result;
|
||||
use crate::{Index, Result};
|
||||
use heed::Error;
|
||||
use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
@ -287,7 +288,7 @@ impl FacetsUpdateIncremental {
|
||||
.prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)?
|
||||
.count();
|
||||
|
||||
if size_highest_level < self.min_level_size {
|
||||
if size_highest_level < self.group_size * self.min_level_size {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
@ -38,12 +38,13 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
|
||||
let key = FacetKey { field_id, level: 0, left_bound: normalised_value };
|
||||
let key_bytes = FacetKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||
|
@ -592,7 +592,7 @@ mod tests {
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::TermsMatchingStrategy;
|
||||
use crate::update::DeleteDocuments;
|
||||
use crate::BEU16;
|
||||
use crate::{db_snap, BEU16};
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
@ -1379,6 +1379,25 @@ mod tests {
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, facet_id_string_docids, @r###"
|
||||
3 0 first 1 [1, ]
|
||||
3 0 second 1 [2, ]
|
||||
3 0 third 1 [3, ]
|
||||
3 0 zeroth 1 [0, ]
|
||||
"###);
|
||||
db_snap!(index, field_id_docid_facet_strings, @r###"
|
||||
3 0 zeroth zeroth
|
||||
3 1 first first
|
||||
3 2 second second
|
||||
3 3 third third
|
||||
"###);
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let hidden = index.faceted_fields(&rtxn).unwrap();
|
||||
@ -1399,6 +1418,15 @@ mod tests {
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, facet_id_string_docids, @"");
|
||||
db_snap!(index, field_id_docid_facet_strings, @"");
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
@ -1412,6 +1440,25 @@ mod tests {
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, facet_id_string_docids, @r###"
|
||||
3 0 first 1 [1, ]
|
||||
3 0 second 1 [2, ]
|
||||
3 0 third 1 [3, ]
|
||||
3 0 zeroth 1 [0, ]
|
||||
"###);
|
||||
db_snap!(index, field_id_docid_facet_strings, @r###"
|
||||
3 0 zeroth zeroth
|
||||
3 1 first first
|
||||
3 2 second second
|
||||
3 3 third third
|
||||
"###);
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
|
||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||
use crate::update::FacetsUpdateIncremental;
|
||||
use crate::{
|
||||
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
|
||||
Result,
|
||||
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint,
|
||||
Index, Result,
|
||||
};
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
facet_id_f64_docids_iter,
|
||||
&index.facet_id_f64_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
// merge cbo roaring bitmaps is not the correct merger because the data in the DB
|
||||
// is FacetGroupValue and not RoaringBitmap
|
||||
// so I need to create my own merging function
|
||||
|
||||
// facet_id_string_docids is encoded as:
|
||||
// key: FacetKeyCodec<StrRefCodec>
|
||||
// value: CboRoaringBitmapCodec
|
||||
// basically
|
||||
|
||||
// TODO: a condition saying "if I have more than 1/50th of the DB to add,
|
||||
// then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
|
||||
// it is a ratio I determine empirically
|
||||
|
||||
// for now I only do it incrementally, to see if things work
|
||||
let indexer = FacetsUpdateIncremental::new(
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
||||
);
|
||||
|
||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||
|
||||
let mut cursor = facet_id_f64_docids_iter.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let key =
|
||||
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
||||
let docids =
|
||||
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||
}
|
||||
for (field_id, new_docids) in new_faceted_docids {
|
||||
let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?;
|
||||
docids |= new_docids;
|
||||
index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?;
|
||||
}
|
||||
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
|
||||
@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
// it is a ratio I determine empirically
|
||||
|
||||
// for now I only do it incrementally, to see if things work
|
||||
let builder = FacetsUpdateIncremental::new(
|
||||
let indexer = FacetsUpdateIncremental::new(
|
||||
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
||||
);
|
||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||
|
||||
let mut cursor = facet_id_string_docids.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let key =
|
||||
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
||||
let value =
|
||||
let docids =
|
||||
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||
builder.insert(wtxn, key.field_id, key.left_bound, &value)?;
|
||||
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||
}
|
||||
for (field_id, new_docids) in new_faceted_docids {
|
||||
let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?;
|
||||
docids |= new_docids;
|
||||
index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?;
|
||||
}
|
||||
is_merged_database = true;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user