mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
Fix some facet indexing bugs
This commit is contained in:
parent
68cbcdf08b
commit
61252248fb
@ -139,7 +139,7 @@ mod test {
|
|||||||
let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize);
|
let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize);
|
||||||
let mut options = heed::EnvOpenOptions::new();
|
let mut options = heed::EnvOpenOptions::new();
|
||||||
let options = options.map_size(4096 * 4 * 100);
|
let options = options.map_size(4096 * 4 * 100);
|
||||||
let tempdir = tempfile::TempDir::new_in("databases/").unwrap();
|
let tempdir = tempfile::TempDir::new().unwrap();
|
||||||
let env = options.open(tempdir.path()).unwrap();
|
let env = options.open(tempdir.path()).unwrap();
|
||||||
let content = env.create_database(None).unwrap();
|
let content = env.create_database(None).unwrap();
|
||||||
|
|
||||||
|
@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String {
|
|||||||
});
|
});
|
||||||
snap
|
snap
|
||||||
}
|
}
|
||||||
|
pub fn snap_field_id_docid_facet_strings(index: &Index) -> String {
|
||||||
|
let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |(
|
||||||
|
(field_id, doc_id, string),
|
||||||
|
other_string,
|
||||||
|
)| {
|
||||||
|
&format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}")
|
||||||
|
});
|
||||||
|
snap
|
||||||
|
}
|
||||||
pub fn snap_documents_ids(index: &Index) -> String {
|
pub fn snap_documents_ids(index: &Index) -> String {
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let documents_ids = index.documents_ids(&rtxn).unwrap();
|
let documents_ids = index.documents_ids(&rtxn).unwrap();
|
||||||
@ -423,6 +432,9 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, facet_id_string_docids) => {{
|
($index:ident, facet_id_string_docids) => {{
|
||||||
$crate::snapshot_tests::snap_facet_id_string_docids(&$index)
|
$crate::snapshot_tests::snap_facet_id_string_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
($index:ident, field_id_docid_facet_strings) => {{
|
||||||
|
$crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index)
|
||||||
|
}};
|
||||||
($index:ident, documents_ids) => {{
|
($index:ident, documents_ids) => {{
|
||||||
$crate::snapshot_tests::snap_documents_ids(&$index)
|
$crate::snapshot_tests::snap_documents_ids(&$index)
|
||||||
}};
|
}};
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::new::{
|
use crate::heed_codec::facet::new::{
|
||||||
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
||||||
};
|
};
|
||||||
use crate::search::facet::get_highest_level;
|
use crate::search::facet::get_highest_level;
|
||||||
use crate::Result;
|
use crate::{Index, Result};
|
||||||
use heed::Error;
|
use heed::Error;
|
||||||
use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn};
|
use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -287,7 +288,7 @@ impl FacetsUpdateIncremental {
|
|||||||
.prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)?
|
.prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)?
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
if size_highest_level < self.min_level_size {
|
if size_highest_level < self.group_size * self.min_level_size {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,12 +38,13 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let (document_id_bytes, normalized_value_bytes) =
|
let (document_id_bytes, normalized_value_bytes) =
|
||||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||||
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
|
let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
|
||||||
let key = FacetKey { field_id, level: 0, left_bound: normalised_value };
|
let key = FacetKey { field_id, level: 0, left_bound: normalised_value };
|
||||||
let key_bytes = FacetKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
let key_bytes = FacetKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||||
|
|
||||||
facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?;
|
facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||||
|
@ -592,7 +592,7 @@ mod tests {
|
|||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::search::TermsMatchingStrategy;
|
use crate::search::TermsMatchingStrategy;
|
||||||
use crate::update::DeleteDocuments;
|
use crate::update::DeleteDocuments;
|
||||||
use crate::BEU16;
|
use crate::{db_snap, BEU16};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_document_replacement() {
|
fn simple_document_replacement() {
|
||||||
@ -1379,6 +1379,25 @@ mod tests {
|
|||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, facet_id_string_docids, @r###"
|
||||||
|
3 0 first 1 [1, ]
|
||||||
|
3 0 second 1 [2, ]
|
||||||
|
3 0 third 1 [3, ]
|
||||||
|
3 0 zeroth 1 [0, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, field_id_docid_facet_strings, @r###"
|
||||||
|
3 0 zeroth zeroth
|
||||||
|
3 1 first first
|
||||||
|
3 2 second second
|
||||||
|
3 3 third third
|
||||||
|
"###);
|
||||||
|
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||||
|
0 []
|
||||||
|
1 []
|
||||||
|
2 []
|
||||||
|
3 [0, 1, 2, 3, ]
|
||||||
|
"###);
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let hidden = index.faceted_fields(&rtxn).unwrap();
|
let hidden = index.faceted_fields(&rtxn).unwrap();
|
||||||
@ -1399,6 +1418,15 @@ mod tests {
|
|||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, facet_id_string_docids, @"");
|
||||||
|
db_snap!(index, field_id_docid_facet_strings, @"");
|
||||||
|
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||||
|
0 []
|
||||||
|
1 []
|
||||||
|
2 []
|
||||||
|
3 [0, 1, 2, 3, ]
|
||||||
|
"###);
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
@ -1412,6 +1440,25 @@ mod tests {
|
|||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, facet_id_string_docids, @r###"
|
||||||
|
3 0 first 1 [1, ]
|
||||||
|
3 0 second 1 [2, ]
|
||||||
|
3 0 third 1 [3, ]
|
||||||
|
3 0 zeroth 1 [0, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, field_id_docid_facet_strings, @r###"
|
||||||
|
3 0 zeroth zeroth
|
||||||
|
3 1 first first
|
||||||
|
3 2 second second
|
||||||
|
3 3 third third
|
||||||
|
"###);
|
||||||
|
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||||
|
0 []
|
||||||
|
1 []
|
||||||
|
2 []
|
||||||
|
3 [0, 1, 2, 3, ]
|
||||||
|
"###);
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
|
|||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::update::FacetsUpdateIncremental;
|
use crate::update::FacetsUpdateIncremental;
|
||||||
use crate::{
|
use crate::{
|
||||||
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
|
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint,
|
||||||
Result,
|
Index, Result,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub(crate) enum TypedChunk {
|
pub(crate) enum TypedChunk {
|
||||||
@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
|
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
|
||||||
append_entries_into_database(
|
// merge cbo roaring bitmaps is not the correct merger because the data in the DB
|
||||||
facet_id_f64_docids_iter,
|
// is FacetGroupValue and not RoaringBitmap
|
||||||
&index.facet_id_f64_docids,
|
// so I need to create my own merging function
|
||||||
wtxn,
|
|
||||||
index_is_empty,
|
// facet_id_string_docids is encoded as:
|
||||||
|value, _buffer| Ok(value),
|
// key: FacetKeyCodec<StrRefCodec>
|
||||||
merge_cbo_roaring_bitmaps,
|
// value: CboRoaringBitmapCodec
|
||||||
)?;
|
// basically
|
||||||
|
|
||||||
|
// TODO: a condition saying "if I have more than 1/50th of the DB to add,
|
||||||
|
// then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
|
||||||
|
// it is a ratio I determine empirically
|
||||||
|
|
||||||
|
// for now I only do it incrementally, to see if things work
|
||||||
|
let indexer = FacetsUpdateIncremental::new(
|
||||||
|
index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||||
|
|
||||||
|
let mut cursor = facet_id_f64_docids_iter.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let key =
|
||||||
|
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
||||||
|
let docids =
|
||||||
|
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||||
|
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||||
|
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||||
|
}
|
||||||
|
for (field_id, new_docids) in new_faceted_docids {
|
||||||
|
let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?;
|
||||||
|
docids |= new_docids;
|
||||||
|
index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?;
|
||||||
|
}
|
||||||
|
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
|
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
|
||||||
@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
// it is a ratio I determine empirically
|
// it is a ratio I determine empirically
|
||||||
|
|
||||||
// for now I only do it incrementally, to see if things work
|
// for now I only do it incrementally, to see if things work
|
||||||
let builder = FacetsUpdateIncremental::new(
|
let indexer = FacetsUpdateIncremental::new(
|
||||||
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
||||||
);
|
);
|
||||||
|
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||||
|
|
||||||
let mut cursor = facet_id_string_docids.into_cursor()?;
|
let mut cursor = facet_id_string_docids.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let key =
|
let key =
|
||||||
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
|
||||||
let value =
|
let docids =
|
||||||
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||||
builder.insert(wtxn, key.field_id, key.left_bound, &value)?;
|
indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||||
|
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||||
|
}
|
||||||
|
for (field_id, new_docids) in new_faceted_docids {
|
||||||
|
let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?;
|
||||||
|
docids |= new_docids;
|
||||||
|
index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?;
|
||||||
}
|
}
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user