diff --git a/infos/src/main.rs b/infos/src/main.rs index cc1727a68..356a5417c 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -319,6 +319,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a070c66eb..cc73cdc65 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -2,6 +2,7 @@ mod beu32_str_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; +mod str_level_position_codec; mod str_str_u8_codec; pub mod facet; @@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs new file mode 100644 index 000000000..c421c04b5 --- /dev/null +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -0,0 +1,42 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrLevelPositionCodec; + +impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { + type DItem = (&'a str, u8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::() + size_of::() * 2; + + if bytes.len() < footer_len { return None } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + + let (level, bytes) = bytes.split_first()?; + let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; + let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + + Some((word, *level, left, right)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { + type EItem = (&'a str, u8, u32, u32); + + fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { + let left = left.to_be_bytes(); + let right = right.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.push(*level); + bytes.extend_from_slice(&left[..]); + bytes.extend_from_slice(&right[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 045eabc3c..0659b207a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, - ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, + ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, }; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -52,6 +52,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the word, level and position range with the docids that corresponds to it. + pub word_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. @@ -62,7 +64,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(9); + options.max_dbs(10); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -71,6 +73,7 @@ impl Index { let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; + let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -94,6 +97,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fe9bd828b..de5c6511e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; +pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 2c24d9c07..250e4b13a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 8a2ba9bbf..b60b7bac2 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -88,6 +88,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents,