From 838ed1cd32379959ca5cb1f66384b22dfd0f769b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 6 Jul 2021 11:31:24 +0200 Subject: [PATCH 1/3] Use an u16 field id instead of one byte --- Cargo.lock | 16 ++++++++++-- milli/Cargo.toml | 3 ++- .../facet/facet_level_value_f64_codec.rs | 11 ++++---- .../facet/facet_value_string_codec.rs | 11 ++++---- .../facet/field_doc_id_facet_f64_codec.rs | 15 +++++------ .../facet/field_doc_id_facet_string_codec.rs | 18 +++++++------ .../heed_codec/field_id_word_count_codec.rs | 12 ++++++--- milli/src/heed_codec/obkv_codec.rs | 8 +++--- milli/src/index.rs | 22 +++++++++------- milli/src/lib.rs | 25 +++++++++++++++++-- milli/src/search/facet/facet_distribution.rs | 4 +-- milli/src/search/facet/mod.rs | 2 +- milli/src/update/delete_documents.rs | 6 +++-- milli/src/update/facets.rs | 14 +++++------ .../update/index_documents/merge_function.rs | 2 +- milli/src/update/index_documents/store.rs | 4 ++- milli/src/update/index_documents/transform.rs | 6 ++--- 17 files changed, 115 insertions(+), 64 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 050ba7c88..18d42029f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -341,6 +341,17 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "concat-arrays" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747" +dependencies = [ + "proc-macro2 1.0.27", + "quote 1.0.9", + "syn 1.0.73", +] + [[package]] name = "convert_case" version = "0.4.0" @@ -1378,6 +1389,7 @@ dependencies = [ "bstr", "byteorder", "chrono", + "concat-arrays", "csv", "either", "flate2", @@ -1609,9 +1621,9 @@ dependencies = [ [[package]] name = "obkv" -version = "0.1.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" +checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" 
[[package]] name = "once_cell" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index dfa02f89d..6af928041 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" bstr = "0.2.15" byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } +concat-arrays = "0.1.2" csv = "1.1.5" either = "1.6.1" flate2 = "1.0.20" @@ -20,7 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } memmap = "0.7.0" -obkv = "0.1.1" +obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs index b23dcb269..1e66427ca 100644 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use crate::facet::value_encoding::f64_into_bytes; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; // TODO do not de/serialize right bound when level = 0 pub struct FacetLevelValueF64Codec; @@ -11,7 +11,8 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { type DItem = (FieldId, u8, f64, f64); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let (level, bytes) = bytes.split_first()?; let (left, right) = if *level != 0 { @@ -23,7 +24,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { (left, left) }; - Some((*field_id, *level, left, right)) + Some((field_id, *level, left, right)) } } @@ -61,8 +62,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { 16 // length }; - let mut bytes = Vec::with_capacity(len + 2); - bytes.push(*field_id); + let 
mut bytes = Vec::with_capacity(len + 3); + bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.push(*level); bytes.extend_from_slice(&buffer[..len]); Some(Cow::Owned(bytes)) diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs index 259dab972..54abb7886 100644 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ b/milli/src/heed_codec/facet/facet_value_string_codec.rs @@ -1,14 +1,14 @@ use std::borrow::Cow; use std::str; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; pub struct FacetValueStringCodec; impl FacetValueStringCodec { pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 1); - out.push(field_id); + out.reserve(value.len() + 2); + out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(value.as_bytes()); } } @@ -17,9 +17,10 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { type DItem = (FieldId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let value = str::from_utf8(bytes).ok()?; - Some((*field_id, value)) + Some((field_id, value)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs index b3c0fa381..22159601c 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId}; +use crate::{try_split_array_at, DocumentId, FieldId}; pub struct FieldDocIdFacetF64Codec; @@ -10,14 +10,15 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { type DItem = (FieldId, DocumentId, f64); fn bytes_decode(bytes: &'a [u8]) 
-> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); - let (document_id_bytes, bytes) = bytes.split_at(4); - let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + let (document_id_bytes, bytes) = try_split_array_at(bytes)?; + let document_id = u32::from_be_bytes(document_id_bytes); let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; - Some((*field_id, document_id, value)) + Some((field_id, document_id, value)) } } @@ -25,8 +26,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { type EItem = (FieldId, DocumentId, f64); fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8); - bytes.push(*field_id); + let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); + bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.extend_from_slice(&document_id.to_be_bytes()); let value_bytes = f64_into_bytes(*value)?; bytes.extend_from_slice(&value_bytes); diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index fd3f1143d..36408f578 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -1,8 +1,7 @@ use std::borrow::Cow; -use std::convert::TryInto; use std::str; -use crate::{DocumentId, FieldId}; +use crate::{try_split_array_at, DocumentId, FieldId}; pub struct FieldDocIdFacetStringCodec; @@ -13,8 +12,8 @@ impl FieldDocIdFacetStringCodec { value: &str, out: &mut Vec, ) { - out.reserve(1 + 4 + value.len()); - out.push(field_id); + out.reserve(2 + 4 + value.len()); + out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes()); out.extend_from_slice(value.as_bytes()); } @@ -24,11 +23,14 @@ impl<'a> heed::BytesDecode<'a> for 
FieldDocIdFacetStringCodec { type DItem = (FieldId, DocumentId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; - let (document_id_bytes, bytes) = bytes.split_at(4); - let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + + let (document_id_bytes, bytes) = try_split_array_at(bytes)?; + let document_id = u32::from_be_bytes(document_id_bytes); + let value = str::from_utf8(bytes).ok()?; - Some((*field_id, document_id, value)) + Some((field_id, document_id, value)) } } diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs index 64f0e1db6..aca7a80c4 100644 --- a/milli/src/heed_codec/field_id_word_count_codec.rs +++ b/milli/src/heed_codec/field_id_word_count_codec.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; -use std::convert::TryInto; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; pub struct FieldIdWordCountCodec; @@ -9,7 +8,9 @@ impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec { type DItem = (FieldId, u8); fn bytes_decode(bytes: &'a [u8]) -> Option { - let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + let ([word_count], _nothing) = try_split_array_at(bytes)?; Some((field_id, word_count)) } } @@ -18,6 +19,9 @@ impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec { type EItem = (FieldId, u8); fn bytes_encode((field_id, word_count): &Self::EItem) -> Option> { - Some(Cow::Owned(vec![*field_id, *word_count])) + let mut bytes = Vec::with_capacity(2 + 1); + bytes.extend_from_slice(&field_id.to_be_bytes()); + bytes.push(*word_count); + Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs index b7414b693..6dad771a8 
100644 --- a/milli/src/heed_codec/obkv_codec.rs +++ b/milli/src/heed_codec/obkv_codec.rs @@ -1,19 +1,19 @@ use std::borrow::Cow; -use obkv::{KvReader, KvWriter}; +use obkv::{KvReaderU16, KvWriterU16}; pub struct ObkvCodec; impl<'a> heed::BytesDecode<'a> for ObkvCodec { - type DItem = KvReader<'a>; + type DItem = KvReaderU16<'a>; fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(KvReader::new(bytes)) + Some(KvReaderU16::new(bytes)) } } impl heed::BytesEncode<'_> for ObkvCodec { - type EItem = KvWriter>; + type EItem = KvWriterU16>; fn bytes_encode(item: &Self::EItem) -> Option> { item.clone().into_inner().map(Cow::Owned).ok() diff --git a/milli/src/index.rs b/milli/src/index.rs index 247e67d52..099a5891d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -523,10 +523,11 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -536,10 +537,11 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] 
+ .copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), @@ -553,10 +555,11 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -569,7 +572,8 @@ impl Index { let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), @@ -723,7 +727,7 @@ impl Index { &self, rtxn: &'t RoTxn, ids: impl IntoIterator, - ) -> Result)>> { + ) -> Result)>> { let mut documents = Vec::new(); for id in ids { @@ -741,7 +745,7 @@ impl Index { pub fn all_documents<'t>( &self, rtxn: &'t RoTxn, - ) -> Result)>>> { + ) -> Result)>>> { Ok(self .documents .iter(rtxn)? 
diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ec9bc32c6..f3bababf6 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -15,6 +15,7 @@ pub mod update; use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; +use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; use std::result::Result as StdResult; @@ -48,7 +49,7 @@ pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type Attribute = u32; pub type DocumentId = u32; -pub type FieldId = u8; +pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; @@ -58,7 +59,7 @@ type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, - obkv: obkv::KvReader, + obkv: obkv::KvReaderU16, ) -> Result> { displayed_fields .iter() @@ -123,6 +124,26 @@ pub fn json_to_string(value: &Value) -> Option { } } +/// Divides one slice into two at an index, returns `None` if mid is out of bounds. +fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { + if mid <= slice.len() { + Some(slice.split_at(mid)) + } else { + None + } +} + +/// Divides one slice into an array and the tail at an index, +/// returns `None` if `N` is out of bounds. 
+fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> +where + [T; N]: for<'a> TryFrom<&'a [T]>, +{ + let (head, tail) = try_split_at(slice, N)?; + let head = head.try_into().ok()?; + Some((head, tail)) +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 3f55006f2..b0b22ac49 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -78,7 +78,7 @@ impl<'a> FacetDistribution<'a> { K: fmt::Display, KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, { - let mut key_buffer = vec![field_id]; + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { key_buffer.truncate(1); @@ -157,7 +157,7 @@ impl<'a> FacetDistribution<'a> { .index .facet_id_string_docids .remap_key_type::() - .prefix_iter(self.rtxn, &[field_id])? + .prefix_iter(self.rtxn, &field_id.to_be_bytes())? .remap_key_type::(); for result in iter { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 4e900bff4..9774bdd52 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -187,7 +187,7 @@ impl<'t> FacetIter<'t> { ) -> heed::Result> { let level = db .remap_types::() - .prefix_iter(rtxn, &[fid][..])? + .prefix_iter(rtxn, &fid.to_be_bytes())? .remap_key_type::() .last() .transpose()? 
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 313f8a909..222f3b2d3 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -430,8 +430,10 @@ where C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, F: Fn(K) -> DocumentId, { - let mut iter = - db.remap_key_type::().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::(); + let mut iter = db + .remap_key_type::() + .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? + .remap_key_type::(); while let Some(result) = iter.next() { let (key, ()) = result?; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 0e2cad69d..5fabbc504 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -15,7 +15,7 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, }; -use crate::{Index, Result}; +use crate::{FieldId, Index, Result}; pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -119,7 +119,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { fn clear_field_number_levels<'t>( wtxn: &'t mut heed::RwTxn, db: heed::Database, - field_id: u8, + field_id: FieldId, ) -> heed::Result<()> { let left = (field_id, 1, f64::MIN, f64::MIN); let right = (field_id, u8::MAX, f64::MAX, f64::MAX); @@ -135,11 +135,11 @@ fn compute_facet_number_levels<'t>( shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, - field_id: u8, + field_id: FieldId, ) -> Result> { let first_level_size = db .remap_key_type::() - .prefix_iter(rtxn, &[field_id])? + .prefix_iter(rtxn, &field_id.to_be_bytes())? 
.remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; @@ -196,11 +196,11 @@ fn compute_facet_number_levels<'t>( fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, - field_id: u8, + field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &[field_id])? { + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { let (_key, docids) = result?; documents_ids |= docids; } @@ -210,7 +210,7 @@ fn compute_faceted_documents_ids( fn write_number_entry( writer: &mut Writer, - field_id: u8, + field_id: FieldId, level: u8, left: f64, right: f64, diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 17283b232..8613a8824 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -40,7 +40,7 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { Ok(values.first().unwrap().to_vec()) } -pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec) { +pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 9ac97c255..ebf365f44 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -7,6 +7,7 @@ use std::time::Instant; use std::{cmp, iter}; use bstr::ByteSlice as _; +use concat_arrays::concat_arrays; use fst::Set; use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; use heed::BytesEncode; @@ -776,7 +777,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for ((fid, count), docids) in self.field_id_word_count_docids { docids_buffer.clear(); CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); - 
self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; + let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]); + self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?; } let fst = builder.into_set(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 24ab276d0..b273460d1 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -626,7 +626,7 @@ mod test { Some("tata".to_string()), false, ); - assert_eq!(result.unwrap(), (0u8, "toto".to_string())); + assert_eq!(result.unwrap(), (0, "toto".to_string())); assert_eq!(fields_map.len(), 1); } @@ -635,7 +635,7 @@ mod test { let mut fields_map = FieldsIdsMap::new(); let result = compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0u8, "tata".to_string())); + assert_eq!(result.unwrap(), (0, "tata".to_string())); assert_eq!(fields_map.len(), 1); } @@ -643,7 +643,7 @@ mod test { fn should_return_default_if_both_are_none() { let mut fields_map = FieldsIdsMap::new(); let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0u8, "id".to_string())); + assert_eq!(result.unwrap(), (0, "id".to_string())); assert_eq!(fields_map.len(), 1); } From a9553af635e6b496797634bc2a4e21d515aa49ee Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 6 Jul 2021 11:40:45 +0200 Subject: [PATCH 2/3] Add a test to check that we can index more than 256 fields --- milli/src/update/index_documents/mod.rs | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3876d5dc1..9ac05fe1a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -842,10 +842,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { mod tests { use 
std::io::Cursor; + use big_s::S; use heed::EnvOpenOptions; use super::*; use crate::update::DeleteDocuments; + use crate::HashMap; #[test] fn simple_document_replacement() { @@ -1352,4 +1354,30 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn index_more_than_256_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut big_object = HashMap::new(); + big_object.insert(S("id"), "wow"); + for i in 0..1000 { + let key = i.to_string(); + big_object.insert(key, "I am a text!"); + } + + let content = vec![big_object]; + let content = serde_json::to_string(&content).unwrap(); + + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(Cursor::new(content), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + } } From 0a7810752545c72c8dacdaa37b3673bf0fd3f37d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 6 Jul 2021 11:48:22 +0200 Subject: [PATCH 3/3] Fix the infos crate to make it read u16 field ids --- infos/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 151e8c664..d5d1ad0af 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -7,7 +7,7 @@ use byte_unit::Byte; use heed::EnvOpenOptions; use milli::facet::FacetType; use milli::index::db_name::*; -use milli::{Index, TreeLevel}; +use milli::{FieldId, Index, TreeLevel}; use structopt::StructOpt; use Command::*; @@ -322,7 +322,7 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( rtxn: &'txn heed::RoTxn, db: heed::Database, - field_id: u8, + field_id: FieldId, ) -> heed::Result> + 'txn>> where KC: heed::BytesDecode<'txn>, @@ -330,7 +330,7 @@ where { let iter = db .remap_key_type::() - 
.prefix_iter(&rtxn, &[field_id])? + .prefix_iter(&rtxn, &field_id.to_be_bytes())? .remap_key_type::(); Ok(Box::new(iter))