Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-01-25 20:57:35 +01:00
Merge #522

522: Do not generate keys that are too long for LMDB r=Kerollmops a=Kerollmops

This PR fixes https://github.com/meilisearch/meilisearch/issues/2338 by making sure that we do not generate keys that are too long for LMDB, especially when we are creating our prefix and proximity pairs keys.

Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in commit 2fe9a02b1c.
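
The change introduces a `valid_lmdb_key` helper in the indexing `helpers` module, re-exports it (first hunk below), and uses it in the prefix databases to skip any key that LMDB would refuse to store. As a rough sketch of what such a guard amounts to, assuming LMDB's default maximum key size of 511 bytes (the constant name and exact bound here are illustrative, not taken from this diff):

// Illustrative sketch only; the real `valid_lmdb_key` lives in the `helpers`
// module re-exported in the first hunk below.
// LMDB's default maximum key size (MDB_MAXKEYSIZE) is 511 bytes.
const LMDB_MAX_KEY_SIZE: usize = 511;

fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
    let key = key.as_ref();
    // Reject empty keys and keys longer than LMDB accepts.
    !key.is_empty() && key.len() <= LMDB_MAX_KEY_SIZE
}

Callers then filter before inserting into the sorter, which is the pattern the `write_prefixes_in_sorter` hunks below apply.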
@@ -20,7 +20,8 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
     fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
-    sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
+    sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader,
+    ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
@@ -1651,4 +1652,55 @@ mod tests {
         builder.execute().unwrap();
         wtxn.commit().unwrap();
     }
+
+    #[test]
+    fn text_with_too_long_keys() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+        let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3";
+
+        // Create 200 documents with a long text
+        let content = {
+            let documents: Vec<_> = (0..200i32)
+                .into_iter()
+                .map(|i| serde_json::json!({ "id": i, "script": script }))
+                .collect();
+
+            let mut writer = std::io::Cursor::new(Vec::new());
+            let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
+            let documents = serde_json::to_vec(&documents).unwrap();
+            builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
+            builder.finish().unwrap();
+            writer.set_position(0);
+            crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+        };
+
+        // Index those 200 long documents
+        let mut wtxn = index.write_txn().unwrap();
+        let config = IndexerConfig::default();
+        let indexing_config = IndexDocumentsConfig::default();
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        builder.add_documents(content).unwrap();
+        builder.execute().unwrap();
+
+        // Create one long document
+        let content = documents!([
+            {"id": 400, "script": script },
+        ]);
+
+        // Index this one long document
+        let config = IndexerConfig::default();
+        let indexing_config = IndexDocumentsConfig::default();
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        builder.add_documents(content).unwrap();
+        builder.execute().unwrap();
+
+        wtxn.commit().unwrap();
+    }
 }
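
For reference, a hypothetical unit test for the length guard itself (not part of this PR, and written against the illustrative sketch above rather than the real helper) could look like:

#[test]
fn rejects_oversized_keys() {
    // Ordinary short keys pass.
    assert!(valid_lmdb_key(b"word_prefix"));
    // Empty keys and keys beyond the assumed 511-byte bound are rejected.
    assert!(!valid_lmdb_key(b""));
    assert!(!valid_lmdb_key(vec![b'a'; 600]));
}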
@@ -5,7 +5,8 @@ use heed::types::{ByteSlice, Str};
 use heed::Database;
 
 use crate::update::index_documents::{
-    create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn,
+    create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
+    CursorClonableMmap, MergeFn,
 };
 use crate::{Result, RoaringBitmapCodec};
 
@@ -124,7 +125,9 @@ fn write_prefixes_in_sorter(
 ) -> Result<()> {
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
-            sorter.insert(&key, data)?;
+            if valid_lmdb_key(&key) {
+                sorter.insert(&key, data)?;
+            }
         }
     }
 
@@ -7,8 +7,8 @@ use log::debug;
 use slice_group_by::GroupBy;
 
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap,
-    MergeFn,
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
+    CursorClonableMmap, MergeFn,
 };
 use crate::{Index, Result, StrStrU8Codec};
 
@@ -188,7 +188,9 @@ fn write_prefixes_in_sorter(
 ) -> Result<()> {
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
-            sorter.insert(&key, data)?;
+            if valid_lmdb_key(&key) {
+                sorter.insert(&key, data)?;
+            }
         }
     }
 
@@ -11,8 +11,8 @@ use crate::error::SerializationError;
 use crate::heed_codec::StrBEU32Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap,
-    MergeFn,
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
+    CursorClonableMmap, MergeFn,
 };
 use crate::{Index, Result};
 
@@ -167,7 +167,9 @@ fn write_prefixes_in_sorter(
 ) -> Result<()> {
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
-            sorter.insert(&key, data)?;
+            if valid_lmdb_key(&key) {
+                sorter.insert(&key, data)?;
+            }
         }
     }
 