mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-11 23:48:56 +01:00
Fix bug in encoding of word_position_docids and word_fid_docids
This commit is contained in:
parent
bd9aba4d77
commit
84d9c731f8
@ -45,11 +45,12 @@ impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
|
|||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let footer_len = size_of::<u16>();
|
let footer_len = size_of::<u16>();
|
||||||
|
|
||||||
if bytes.len() < footer_len {
|
if bytes.len() < footer_len + 1 {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
|
let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||||
|
let (_, word) = word_plus_nul_byte.split_last()?;
|
||||||
let word = str::from_utf8(word).ok()?;
|
let word = str::from_utf8(word).ok()?;
|
||||||
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
|
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
|
||||||
|
|
||||||
@ -63,8 +64,9 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
|
|||||||
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
let pos = pos.to_be_bytes();
|
let pos = pos.to_be_bytes();
|
||||||
|
|
||||||
let mut bytes = Vec::with_capacity(word.len() + pos.len());
|
let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
|
||||||
bytes.extend_from_slice(word.as_bytes());
|
bytes.extend_from_slice(word.as_bytes());
|
||||||
|
bytes.push(0);
|
||||||
bytes.extend_from_slice(&pos[..]);
|
bytes.extend_from_slice(&pos[..]);
|
||||||
|
|
||||||
Some(Cow::Owned(bytes))
|
Some(Cow::Owned(bytes))
|
||||||
|
@ -126,9 +126,9 @@ pub struct Index {
|
|||||||
|
|
||||||
/// Maps the field id and the word count with the docids that correspond to it.
|
/// Maps the field id and the word count with the docids that correspond to it.
|
||||||
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
|
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
/// Maps the word prefix and a position with all the docids where the prefix appears at the position.
|
||||||
pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the word and the field id with the docids that corresponds to it.
|
/// Maps the word prefix and a field id with all the docids where the prefix appears inside the field.
|
||||||
pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the script and language with all the docids that correspond to it.
|
/// Maps the script and language with all the docids that correspond to it.
|
||||||
|
@ -261,22 +261,6 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
.transpose()
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_db_word_position_docids(
|
|
||||||
&mut self,
|
|
||||||
word: Interned<String>,
|
|
||||||
position: u16,
|
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
|
||||||
DatabaseCache::get_value(
|
|
||||||
self.txn,
|
|
||||||
(word, position),
|
|
||||||
&(self.word_interner.get(word).as_str(), position),
|
|
||||||
&mut self.db_cache.word_position_docids,
|
|
||||||
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
|
||||||
)?
|
|
||||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
||||||
.transpose()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_db_word_fid_docids(
|
pub fn get_db_word_fid_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
word: Interned<String>,
|
word: Interned<String>,
|
||||||
@ -361,6 +345,22 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
Ok(fids)
|
Ok(fids)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_db_word_position_docids(
|
||||||
|
&mut self,
|
||||||
|
word: Interned<String>,
|
||||||
|
position: u16,
|
||||||
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
|
DatabaseCache::get_value(
|
||||||
|
self.txn,
|
||||||
|
(word, position),
|
||||||
|
&(self.word_interner.get(word).as_str(), position),
|
||||||
|
&mut self.db_cache.word_position_docids,
|
||||||
|
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
||||||
|
)?
|
||||||
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||||
|
.transpose()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_db_word_prefix_position_docids(
|
pub fn get_db_word_prefix_position_docids(
|
||||||
&mut self,
|
&mut self,
|
||||||
word_prefix: Interned<String>,
|
word_prefix: Interned<String>,
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
|
use crate::{
|
||||||
|
db_snap, index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
fn create_index() -> TempIndex {
|
fn create_index() -> TempIndex {
|
||||||
let index = TempIndex::new();
|
let index = TempIndex::new();
|
||||||
@ -6,7 +8,7 @@ fn create_index() -> TempIndex {
|
|||||||
index
|
index
|
||||||
.update_settings(|s| {
|
.update_settings(|s| {
|
||||||
s.set_primary_key("id".to_owned());
|
s.set_primary_key("id".to_owned());
|
||||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]);
|
||||||
s.set_criteria(vec![Criterion::Attribute]);
|
s.set_criteria(vec![Criterion::Attribute]);
|
||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@ -33,20 +35,84 @@ fn create_index() -> TempIndex {
|
|||||||
"id": 4,
|
"id": 4,
|
||||||
"text": "the quick brown fox",
|
"text": "the quick brown fox",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
the quick brown fox",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"text": "quick a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
brown",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||||
|
quick brown",
|
||||||
|
},
|
||||||
]))
|
]))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
index
|
index
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_attribute_fid_simple() {
|
fn test_attribute_position_simple() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4");
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("quick brown");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]");
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn test_attribute_position_repeated() {
|
||||||
let index = create_index();
|
let index = create_index();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let mut s = Search::new(&txn, &index);
|
let mut s = Search::new(&txn, &index);
|
||||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
s.query("the quick brown fox");
|
s.query("a a a a a");
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]");
|
||||||
}
|
}
|
||||||
|
@ -36,6 +36,7 @@ pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
|||||||
for position in read_u32_ne_bytes(value) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
key_buffer.push(0);
|
||||||
let (fid, _) = relative_from_absolute_position(position);
|
let (fid, _) = relative_from_absolute_position(position);
|
||||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
|
@ -39,6 +39,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||||||
for position in read_u32_ne_bytes(value) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
key_buffer.push(0);
|
||||||
let (_, position) = relative_from_absolute_position(position);
|
let (_, position) = relative_from_absolute_position(position);
|
||||||
let position = bucketed_position(position);
|
let position = bucketed_position(position);
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
|
Loading…
Reference in New Issue
Block a user