Fix bug in encoding of word_position_docids and word_fid_docids

This commit is contained in:
Loïc Lecrenier 2023-04-24 09:59:30 +02:00
parent bd9aba4d77
commit 84d9c731f8
6 changed files with 96 additions and 26 deletions

View File

@ -45,11 +45,12 @@ impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u16>(); let footer_len = size_of::<u16>();
if bytes.len() < footer_len { if bytes.len() < footer_len + 1 {
return None; return None;
} }
let (word, bytes) = bytes.split_at(bytes.len() - footer_len); let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
let (_, word) = word_plus_nul_byte.split_last()?;
let word = str::from_utf8(word).ok()?; let word = str::from_utf8(word).ok()?;
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
@ -63,8 +64,9 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> { fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
let pos = pos.to_be_bytes(); let pos = pos.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + pos.len()); let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
bytes.extend_from_slice(word.as_bytes()); bytes.extend_from_slice(word.as_bytes());
bytes.push(0);
bytes.extend_from_slice(&pos[..]); bytes.extend_from_slice(&pos[..]);
Some(Cow::Owned(bytes)) Some(Cow::Owned(bytes))

View File

@ -126,9 +126,9 @@ pub struct Index {
/// Maps the field id and the word count with the docids that correspond to it. /// Maps the field id and the word count with the docids that correspond to it.
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>, pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
/// Maps the position of a word prefix with all the docids where this prefix appears. /// Maps the word prefix and a position with all the docids where the prefix appears at the position.
pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
/// Maps the word and the field id with the docids that corresponds to it. /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field
pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
/// Maps the script and language with all the docids that correspond to it. /// Maps the script and language with all the docids that correspond to it.

View File

@ -261,22 +261,6 @@ impl<'ctx> SearchContext<'ctx> {
.transpose() .transpose()
} }
/// Looks up the docids stored in `word_position_docids` for `word` at `position`.
///
/// The lookup is delegated to `DatabaseCache::get_value`, which is handed the
/// `(word, position)` pair, the interned word resolved to a `&str` together with
/// `position`, and the `db_cache.word_position_docids` cache.
///
/// Returns `Ok(None)` when the lookup yields no value. Errors from the lookup are
/// propagated, and bytes that `CboRoaringBitmapCodec` cannot decode are reported
/// as `heed::Error::Decoding`.
pub fn get_db_word_position_docids(
&mut self,
word: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word, position),
&(self.word_interner.get(word).as_str(), position),
&mut self.db_cache.word_position_docids,
// The data type is remapped to `ByteSlice`, so the value comes back as raw
// bytes and is only decoded into a bitmap below.
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
)?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
}
pub fn get_db_word_fid_docids( pub fn get_db_word_fid_docids(
&mut self, &mut self,
word: Interned<String>, word: Interned<String>,
@ -361,6 +345,22 @@ impl<'ctx> SearchContext<'ctx> {
Ok(fids) Ok(fids)
} }
/// Looks up the docids stored in `word_position_docids` for `word` at `position`.
///
/// The lookup is delegated to `DatabaseCache::get_value`, which is handed the
/// `(word, position)` pair, the interned word resolved to a `&str` together with
/// `position`, and the `db_cache.word_position_docids` cache.
///
/// Returns `Ok(None)` when the lookup yields no value. Errors from the lookup are
/// propagated, and bytes that `CboRoaringBitmapCodec` cannot decode are reported
/// as `heed::Error::Decoding`.
pub fn get_db_word_position_docids(
&mut self,
word: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word, position),
&(self.word_interner.get(word).as_str(), position),
&mut self.db_cache.word_position_docids,
// The data type is remapped to `ByteSlice`, so the value comes back as raw
// bytes and is only decoded into a bitmap below.
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
)?
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
.transpose()
}
pub fn get_db_word_prefix_position_docids( pub fn get_db_word_prefix_position_docids(
&mut self, &mut self,
word_prefix: Interned<String>, word_prefix: Interned<String>,

View File

@ -1,4 +1,6 @@
use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; use crate::{
db_snap, index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy,
};
fn create_index() -> TempIndex { fn create_index() -> TempIndex {
let index = TempIndex::new(); let index = TempIndex::new();
@ -6,7 +8,7 @@ fn create_index() -> TempIndex {
index index
.update_settings(|s| { .update_settings(|s| {
s.set_primary_key("id".to_owned()); s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]); s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]);
s.set_criteria(vec![Criterion::Attribute]); s.set_criteria(vec![Criterion::Attribute]);
}) })
.unwrap(); .unwrap();
@ -33,20 +35,84 @@ fn create_index() -> TempIndex {
"id": 4, "id": 4,
"text": "the quick brown fox", "text": "the quick brown fox",
}, },
{
"id": 5,
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
the quick brown fox",
},
{
"id": 6,
"text": "quick a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
brown",
},
{
"id": 7,
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
quick brown",
},
])) ]))
.unwrap(); .unwrap();
index index
} }
#[test] #[test]
fn test_attribute_fid_simple() { fn test_attribute_position_simple() {
let index = create_index();
db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4");
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("quick brown");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]");
}
#[test]
fn test_attribute_position_repeated() {
let index = create_index(); let index = create_index();
let txn = index.read_txn().unwrap(); let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index); let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All); s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the quick brown fox"); s.query("a a a a a");
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]");
} }

View File

@ -36,6 +36,7 @@ pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
for position in read_u32_ne_bytes(value) { for position in read_u32_ne_bytes(value) {
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(word_bytes); key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (fid, _) = relative_from_absolute_position(position); let (fid, _) = relative_from_absolute_position(position);
key_buffer.extend_from_slice(&fid.to_be_bytes()); key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;

View File

@ -39,6 +39,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
for position in read_u32_ne_bytes(value) { for position in read_u32_ne_bytes(value) {
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(word_bytes); key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (_, position) = relative_from_absolute_position(position); let (_, position) = relative_from_absolute_position(position);
let position = bucketed_position(position); let position = bucketed_position(position);
key_buffer.extend_from_slice(&position.to_be_bytes()); key_buffer.extend_from_slice(&position.to_be_bytes());