Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-09 06:28:55 +01:00)
Generate the dictionary from the first 10k documents
This commit is contained in:
parent
0d63d02ab2
commit
767f20e30d
Cargo.lock (generated): 24 changes
@@ -3563,6 +3563,7 @@ dependencies = [
 "ureq",
 "url",
 "uuid",
+ "zstd 0.11.2+zstd.1.5.2",
 ]

 [[package]]
@@ -6406,7 +6407,7 @@ dependencies = [
 "time",
 "zeroize",
 "zopfli",
- "zstd",
+ "zstd 0.13.2",
 ]

 [[package]]
@@ -6423,13 +6424,32 @@ dependencies = [
 "simd-adler32",
 ]

+[[package]]
+name = "zstd"
+version = "0.11.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
+dependencies = [
+ "zstd-safe 5.0.2+zstd.1.5.2",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
 dependencies = [
- "zstd-safe",
+ "zstd-safe 7.2.0",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "5.0.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
+dependencies = [
+ "libc",
+ "zstd-sys",
 ]

 [[package]]
@@ -39,6 +39,7 @@ indexmap = { version = "2.2.6", features = ["serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 lz4_flex = "0.11.3"
+zstd = { version = "0.11.2", features = ["zdict_builder"] }
 memmap2 = "0.9.4"
 obkv = "0.2.2"
 once_cell = "1.19.0"
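Note: the manifest hunk above pulls in zstd only for its dictionary builder (the zdict_builder feature exposes zstd::dict); the documents themselves stay LZ4-compressed via lz4_flex. This is also why the lock file now carries two zstd versions, the new 0.11.2 next to the pre-existing 0.13.2 used elsewhere in the workspace. A minimal sketch of the training call this feature unlocks (the helper name and the 64_000-byte cap are illustrative, not part of this commit):

    // Hypothetical helper: concatenate samples and train a zstd dictionary.
    fn train_dictionary(samples: &[Vec<u8>]) -> std::io::Result<Vec<u8>> {
        let mut continuous = Vec::new(); // every sample, back to back
        let mut sizes = Vec::new();      // length of each individual sample
        for sample in samples {
            continuous.extend_from_slice(sample);
            sizes.push(sample.len());
        }
        // Requires the `zdict_builder` feature; caps the dictionary at 64_000 bytes.
        zstd::dict::from_continuous(&continuous, &sizes, 64_000)
    }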
@@ -28,13 +28,13 @@ impl<'a> CompressedKvReaderU16<'a> {
     pub fn decompress_with<'b>(
         &self,
         buffer: &'b mut Vec<u8>,
-        dictionnary: &[u8],
+        dictionary: &[u8],
     ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
         let (size, input) = lz4_flex::block::uncompressed_size(self.0)?;
         buffer.resize(size, 0);
         // TODO loop to increase the buffer size of need be
         let size =
-            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionnary)?;
+            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
         Ok(KvReaderU16::new(&buffer[..size]))
     }
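Note: decompress_with reads the uncompressed size that lz4_flex prepends to the block, sizes the caller's buffer, then decompresses against the shared dictionary. A sketch of the full round trip, assuming the writer side uses lz4_flex's dictionary-aware compress_prepend_size_with_dict (the compression call is an assumption; only the decompression path appears in this diff):

    // Hypothetical round trip mirroring CompressedKvReaderU16::decompress_with.
    fn round_trip(document: &[u8], dictionary: &[u8]) {
        // Assumed writer side: compress with the dictionary, prepending the size.
        let compressed = lz4_flex::block::compress_prepend_size_with_dict(document, dictionary);

        // Reader side, as in the diff: recover the size, then decompress in place.
        let (size, input) = lz4_flex::block::uncompressed_size(&compressed).unwrap();
        let mut buffer = vec![0u8; size];
        let written =
            lz4_flex::block::decompress_into_with_dict(input, &mut buffer, dictionary).unwrap();
        assert_eq!(&buffer[..written], document);
    }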
@@ -5,7 +5,7 @@ mod transform;
 mod typed_chunk;

 use std::collections::{HashMap, HashSet};
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek, Write};
 use std::iter;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
@@ -41,7 +41,7 @@ use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::vector::EmbeddingConfigs;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};

 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 4;
@@ -568,7 +568,7 @@ where

         // TODO increase this number to 10k and put it in a const somewhere
         // I don't like that this dangerous condition is here...
-        if number_of_documents > 1_000
+        if number_of_documents > 10_000
             && self.index.document_compression_dictionary(self.wtxn)?.is_none()
         {
             self.manage_compression_dictionary()?;
@ -767,17 +767,29 @@ where
|
|||||||
name = "compress_documents_database"
|
name = "compress_documents_database"
|
||||||
)]
|
)]
|
||||||
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
|
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
|
||||||
// TODO This is a dumb dictionary, just so you get the idea.
|
let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
|
||||||
// We need to compute a better one by using zstd or something else.
|
let mut sample_sizes = Vec::new();
|
||||||
let dictionary = b"movietraileradventurehorror";
|
// TODO make this 1_000 be 10k and const
|
||||||
self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
|
let documents = self.index.documents.remap_types::<BEU32, Bytes>();
|
||||||
|
for result in documents.iter(self.wtxn)?.take(10_000) {
|
||||||
|
let (_id, bytes) = result?;
|
||||||
|
sample_file.write_all(bytes)?;
|
||||||
|
sample_sizes.push(bytes.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO manage this unwrap correctly
|
||||||
|
let sample_file = sample_file.into_inner().unwrap();
|
||||||
|
let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
|
||||||
|
// TODO make this 64_000 const
|
||||||
|
let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
|
||||||
|
self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
|
||||||
|
|
||||||
// TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
|
// TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
|
||||||
let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
|
let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let (docid, document) = result?;
|
let (docid, document) = result?;
|
||||||
// TODO manage this unwrap correctly
|
// TODO manage this unwrap correctly
|
||||||
let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
|
let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary);
|
||||||
// safety the compressed document is entirely owned
|
// safety the compressed document is entirely owned
|
||||||
unsafe {
|
unsafe {
|
||||||
iter.put_current_with_options::<CompressedObkvCodec>(
|
iter.put_current_with_options::<CompressedObkvCodec>(
|
||||||
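Note: the new body samples up to the first 10k stored documents into a temp file, flushes it, memory-maps it, and trains the dictionary before recompressing every stored document. Two details matter here: into_inner flushes the BufWriter so the mmap sees every byte, and from_continuous needs the per-sample lengths alongside the concatenated bytes. A condensed, self-contained sketch of that pipeline under those assumptions (the free function and its input are illustrative; the real code runs inside the indexer against the LMDB documents database):

    use std::io::{BufWriter, Write};

    // Hypothetical standalone version of the sampling + training steps above.
    fn build_dictionary<'a>(
        documents: impl Iterator<Item = &'a [u8]>,
    ) -> std::io::Result<Vec<u8>> {
        let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
        let mut sample_sizes = Vec::new();
        for bytes in documents.take(10_000) {
            sample_file.write_all(bytes)?;
            sample_sizes.push(bytes.len());
        }
        // Flush the BufWriter; an unflushed tail would be invisible to the mmap.
        let sample_file = sample_file.into_inner().map_err(|e| e.into_error())?;
        // Safety: the temp file is private and not modified while mapped.
        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
        zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)
    }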