Introduce the word_level_position_docids Index database

This commit is contained in:
Kerollmops 2021-03-11 17:24:35 +01:00 committed by many
parent 75e7b1e3da
commit b0a417f342
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
7 changed files with 54 additions and 3 deletions

View File

@ -319,6 +319,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values: _,
documents,

View File

@ -2,6 +2,7 @@ mod beu32_str_codec;
mod obkv_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_level_position_codec;
mod str_str_u8_codec;
pub mod facet;
@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
pub use self::str_level_position_codec::StrLevelPositionCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;

View File

@ -0,0 +1,42 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::mem::size_of;
use std::str;
/// A heed codec for keys made of a UTF-8 word followed by a one-byte level
/// and two big-endian `u32` values (named `left` and `right` by the impls).
pub struct StrLevelPositionCodec;
impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
    type DItem = (&'a str, u8, u32, u32);

    /// Decodes a key laid out as
    /// `word bytes | level (1 byte) | left (4 bytes, BE) | right (4 bytes, BE)`.
    ///
    /// Returns `None` when the input is shorter than the fixed-size footer
    /// or when the word part is not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        const BOUND_LEN: usize = size_of::<u32>();
        const FOOTER_LEN: usize = size_of::<u8>() + 2 * BOUND_LEN;

        // The word carries no terminator: it is everything that precedes
        // the fixed-size footer, so its length is derived from the total.
        let word_len = bytes.len().checked_sub(FOOTER_LEN)?;
        let (word_bytes, footer) = bytes.split_at(word_len);
        let word = str::from_utf8(word_bytes).ok()?;

        let (&level, bounds) = footer.split_first()?;
        let (left_bytes, right_bytes) = bounds.split_at(BOUND_LEN);
        let left_bytes: [u8; 4] = left_bytes.try_into().ok()?;
        let right_bytes: [u8; 4] = right_bytes.try_into().ok()?;

        Some((
            word,
            level,
            u32::from_be_bytes(left_bytes),
            u32::from_be_bytes(right_bytes),
        ))
    }
}
impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
type EItem = (&'a str, u8, u32, u32);
fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let left = left.to_be_bytes();
let right = right.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len());
bytes.extend_from_slice(word.as_bytes());
bytes.push(*level);
bytes.extend_from_slice(&left[..]);
bytes.extend_from_slice(&right[..]);
Some(Cow::Owned(bytes))
}
}

View File

@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution,
use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
use crate::{
BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec,
ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec,
};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
@ -52,6 +52,8 @@ pub struct Index {
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the word, level and position range with the docids that correspond to it.
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the globally ordered value with the docids that correspond to it.
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
/// Maps the document id, the facet field id and the globally ordered value.
@ -62,7 +64,7 @@ pub struct Index {
impl Index {
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
options.max_dbs(9);
options.max_dbs(10);
let env = options.open(path)?;
let main = env.create_poly_database(Some("main"))?;
@ -71,6 +73,7 @@ impl Index {
let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
let documents = env.create_database(Some("documents"))?;
@ -94,6 +97,7 @@ impl Index {
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,

View File

@ -22,7 +22,7 @@ use serde_json::{Map, Value};
pub use self::criterion::{Criterion, default_criteria};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index;

View File

@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,

View File

@ -88,6 +88,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,