317: Fix the facet string docids filterable deletion bug r=Kerollmops a=Kerollmops

Fixes a bug where the deletion of documents was returning a decoding error. But only when the settings are set with filterable attributes.

This bug was introduced in #254 in which we made the engine faster in returning the facet distribution. We changed the way we were storing the inverted index, we were no more storing only documents ids with the original values but also groups identified with integers, depending on the facet level we were using. This is similar to how facet numbers are already stored.

⚠️ As `@curquiza` already said, we must first revert #309 before merging this!

Related to https://github.com/meilisearch/MeiliSearch/issues/1601.

Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2021-08-23 08:57:16 +00:00 committed by GitHub
commit aa1ce97748
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -4,12 +4,15 @@ use std::collections::HashMap;
use chrono::Utc; use chrono::Utc;
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use super::ClearDocuments; use super::ClearDocuments;
use crate::error::{InternalError, UserError}; use crate::error::{InternalError, SerializationError, UserError};
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::facet::{
FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec,
};
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
@ -451,26 +454,61 @@ where
Ok(()) Ok(())
} }
fn remove_docids_from_facet_field_id_string_docids<'a, C>( fn remove_docids_from_facet_field_id_string_docids<'a, C, D>(
wtxn: &'a mut heed::RwTxn, wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>, db: &heed::Database<C, D>,
to_remove: &RoaringBitmap, to_remove: &RoaringBitmap,
) -> heed::Result<()> ) -> crate::Result<()> {
where let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS);
C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, let mut iter = db.remap_types::<ByteSlice, ByteSlice>().iter_mut(wtxn)?;
{
let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, (original_value, mut docids)) = result?; let (key, val) = result?;
let previous_len = docids.len(); match FacetLevelValueU32Codec::bytes_decode(key) {
docids -= to_remove; Some(_) => {
if docids.is_empty() { // If we are able to parse this key it means it is a facet string group
// safety: we don't keep references from inside the LMDB database. // level key. We must then parse the value using the appropriate codec.
unsafe { iter.del_current()? }; let (group, mut docids) =
} else if docids.len() != previous_len { FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_decode(val)
let bytes = bytes.to_owned(); .ok_or_else(|| SerializationError::Decoding { db_name })?;
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.put_current(&bytes, &(original_value, docids))? }; let previous_len = docids.len();
docids -= to_remove;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
} else if docids.len() != previous_len {
let key = key.to_owned();
let val = &(group, docids);
let value_bytes =
FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_encode(val)
.ok_or_else(|| SerializationError::Encoding { db_name })?;
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.put_current(&key, &value_bytes)? };
}
}
None => {
// The key corresponds to a level zero facet string.
let (original_value, mut docids) =
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(val)
.ok_or_else(|| SerializationError::Decoding { db_name })?;
let previous_len = docids.len();
docids -= to_remove;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
} else if docids.len() != previous_len {
let key = key.to_owned();
let val = &(original_value, docids);
let value_bytes =
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(val)
.ok_or_else(|| SerializationError::Encoding { db_name })?;
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.put_current(&key, &value_bytes)? };
}
}
} }
} }
@ -505,10 +543,13 @@ where
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use big_s::S;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::hashset;
use super::*; use super::*;
use crate::update::{IndexDocuments, UpdateFormat}; use crate::update::{IndexDocuments, Settings, UpdateFormat};
use crate::FilterCondition;
#[test] #[test]
fn delete_documents_with_numbers_as_primary_key() { fn delete_documents_with_numbers_as_primary_key() {
@ -566,4 +607,55 @@ mod tests {
wtxn.commit().unwrap(); wtxn.commit().unwrap();
} }
#[test]
fn delete_documents_with_filterable_attributes() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_primary_key(S("docid"));
builder.set_filterable_fields(hashset! { S("label") });
builder.execute(|_, _| ()).unwrap();
let content = &br#"[
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
{"docid":"1_36","label":"drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing"},
{"docid":"1_39","label":"abstract"},
{"docid":"1_40","label":"cartoon"},
{"docid":"1_41","label":"art,drawing"},
{"docid":"1_42","label":"art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern"},
{"docid":"1_44","label":"drawing"},
{"docid":"1_45","label":"art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
// Delete not all of the documents but some of them.
let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap();
builder.delete_external_id("1_4");
builder.execute().unwrap();
let filter = FilterCondition::from_str(&wtxn, &index, "label = sign").unwrap();
let results = index.search(&wtxn).filter(filter).execute().unwrap();
assert!(results.documents_ids.is_empty());
wtxn.commit().unwrap();
}
} }