From 211c8763b935b51f9beaab807195dfe7ed4cd968 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 3 May 2022 09:57:03 +0200 Subject: [PATCH] Make sure that we do not generate too long keys --- milli/src/update/index_documents/mod.rs | 3 ++- milli/src/update/word_prefix_docids.rs | 7 +++++-- milli/src/update/word_prefix_pair_proximity_docids.rs | 8 +++++--- milli/src/update/words_prefix_position_docids.rs | 8 +++++--- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58e964986..ed2347b25 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,7 +20,8 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, + sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 2887b5583..1002c13cf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,7 +5,8 @@ use heed::types::{ByteSlice, Str}; use heed::Database; use crate::update::index_documents::{ - create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, + create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, }; use crate::{Result, RoaringBitmapCodec}; @@ -124,7 +125,9 @@ fn write_prefixes_in_sorter( ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { - sorter.insert(&key, data)?; + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } } } diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index be0ddf005..72b41c472 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -7,8 +7,8 @@ use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, - MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, }; use crate::{Index, Result, StrStrU8Codec}; @@ -188,7 +188,9 @@ fn write_prefixes_in_sorter( ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { - sorter.insert(&key, data)?; + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } } } diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 77e9e7c29..b2b24084d 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -11,8 +11,8 @@ use crate::error::SerializationError; use crate::heed_codec::StrBEU32Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, - MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, }; use crate::{Index, Result}; @@ -167,7 +167,9 @@ fn write_prefixes_in_sorter( ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { - sorter.insert(&key, data)?; + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } } }