mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-09 06:28:55 +01:00
Clean up some parts of the code
This commit is contained in:
parent
4ceade43cd
commit
fd8c90b858
@ -566,13 +566,9 @@ where
|
|||||||
word_fid_docids.map(MergerBuilder::build),
|
word_fid_docids.map(MergerBuilder::build),
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// TODO increase this number to 10k and put it in a const somewhere
|
// This call contains an internal condition to ensure we do not always
|
||||||
// I don't like that this dangerous condition is here...
|
// generate compression dictionaries and always compress documents.
|
||||||
if number_of_documents > 10_000
|
self.manage_compression_dictionary()?;
|
||||||
&& self.index.document_compression_dictionary(self.wtxn)?.is_none()
|
|
||||||
{
|
|
||||||
self.manage_compression_dictionary()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(number_of_documents)
|
Ok(number_of_documents)
|
||||||
}
|
}
|
||||||
@ -767,32 +763,42 @@ where
|
|||||||
name = "compress_documents_database"
|
name = "compress_documents_database"
|
||||||
)]
|
)]
|
||||||
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
|
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
|
||||||
|
/// The size of the dictionary generated from a sample of the documents already
|
||||||
|
/// in the database. It will be used when compressing and decompressing documents.
|
||||||
|
const COMPRESSION_DICTIONARY_SIZE: usize = 64_000;
|
||||||
|
/// The minimum number of documents to trigger the generation of the compression dictionary.
|
||||||
|
const COMPRESSION_ON_NUMBER_OF_DOCUMENTS: usize = 10_000;
|
||||||
|
|
||||||
|
if self.index.number_of_documents(self.wtxn)? < COMPRESSION_ON_NUMBER_OF_DOCUMENTS as u64
|
||||||
|
|| self.index.document_compression_dictionary(self.wtxn)?.is_some()
|
||||||
|
{
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
|
let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
|
||||||
let mut sample_sizes = Vec::new();
|
let mut sample_sizes = Vec::new();
|
||||||
// TODO make this 1_000 be 10k and const
|
// TODO make this 1_000 be 10k and const
|
||||||
let documents = self.index.documents.remap_types::<BEU32, Bytes>();
|
let documents = self.index.documents.remap_types::<BEU32, Bytes>();
|
||||||
for result in documents.iter(self.wtxn)?.take(10_000) {
|
for result in documents.iter(self.wtxn)?.take(COMPRESSION_ON_NUMBER_OF_DOCUMENTS) {
|
||||||
let (_id, bytes) = result?;
|
let (_id, bytes) = result?;
|
||||||
sample_file.write_all(bytes)?;
|
sample_file.write_all(bytes)?;
|
||||||
sample_sizes.push(bytes.len());
|
sample_sizes.push(bytes.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO manage this unwrap correctly
|
let sample_file = sample_file.into_inner().map_err(|ie| ie.into_error())?;
|
||||||
let sample_file = sample_file.into_inner().unwrap();
|
|
||||||
let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
|
let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
|
||||||
// TODO make this 64_000 const
|
let dictionary =
|
||||||
let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
|
zstd::dict::from_continuous(&sample_data, &sample_sizes, COMPRESSION_DICTIONARY_SIZE)?;
|
||||||
self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
|
self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
|
||||||
// safety: We just set the dictionary above, it must be there when we get it back.
|
// safety: We just set the dictionary above. It must be there when we get it back.
|
||||||
let dictionary = self.index.document_compression_dictionary(self.wtxn)?.unwrap();
|
let dictionary = self.index.document_compression_dictionary(self.wtxn)?.unwrap();
|
||||||
|
|
||||||
// TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
|
|
||||||
let mut iter = self.index.documents.iter_mut(self.wtxn)?;
|
let mut iter = self.index.documents.iter_mut(self.wtxn)?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let (docid, document) = result?;
|
let (docid, document) = result?;
|
||||||
let document = document.as_non_compressed().as_bytes();
|
let document = document.as_non_compressed().as_bytes();
|
||||||
let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary)?;
|
let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary)?;
|
||||||
// safety the compressed document is entirely owned
|
// safety: the compressed document is entirely owned
|
||||||
unsafe {
|
unsafe {
|
||||||
iter.put_current_with_options::<CompressedObkvCodec>(
|
iter.put_current_with_options::<CompressedObkvCodec>(
|
||||||
PutFlags::empty(),
|
PutFlags::empty(),
|
||||||
|
Loading…
Reference in New Issue
Block a user