mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
feat: Improve the deserialization time of a Blob
This commit is contained in:
parent
a18401f47e
commit
6bd779f9ae
11 changed files with 198 additions and 227 deletions
|
@ -1,43 +1,45 @@
|
|||
use std::slice::from_raw_parts;
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{io, mem};
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
|
||||
// Owns the byte buffer that every `DocIds` method in this listing reads
// through `self.data`.
// NOTE(review): two `data` field lines are visible here (`Data` and
// `SharedData`). This looks like the before/after pair of a diff hunk rather
// than two real fields — confirm which one is current.
#[derive(Default, Clone)]
|
||||
pub struct DocIds {
|
||||
data: Data,
|
||||
data: SharedData,
|
||||
}
|
||||
|
||||
impl DocIds {
|
||||
/// Builds a `DocIds` whose bytes come from a read-only memory map of `path`.
///
/// The map is wrapped in `Data::Mmap` and stored as-is; no header is parsed.
///
/// # Errors
///
/// Returns the `io::Error` produced by `MmapReadOnly::open_path`.
///
/// # Safety
///
/// NOTE(review): the caller contract is not spelled out in this listing;
/// presumably the mapped file must not be modified while the map is alive
/// — confirm against the `fst` documentation.
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
let data = Data::Mmap(mmap);
|
||||
Ok(DocIds { data })
|
||||
}
|
||||
|
||||
// Builds a `DocIds` from an owned byte vector.
// NOTE(review): this span carries two signatures and two bodies. One returns
// `Result<Self, Box<Error>>` and fills a `Data::Shared` directly; the other
// returns `io::Result<Self>` and delegates to `from_shared_bytes` with
// offset 0 and the full vector length. Presumably these are the before/after
// lines of a diff — confirm which version is current.
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
// FIXME check if modulo DocumentId
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let len = vec.len();
|
||||
let data = Data::Shared {
|
||||
bytes: Arc::new(vec),
|
||||
offset: 0,
|
||||
len: len
|
||||
};
|
||||
DocIds::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
}
|
||||
|
||||
/// Builds a `DocIds` from the `len` bytes that start at `offset` inside the
/// shared buffer `bytes`.
///
/// The three arguments are stored in a `SharedData` without being checked
/// against `bytes.len()` here, then handed to `from_data`.
///
/// # Errors
///
/// Returns whatever error `from_data` reports.
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
||||
let data = SharedData { bytes, offset, len };
|
||||
DocIds::from_data(data)
|
||||
}
|
||||
|
||||
/// Parses a length-prefixed payload out of `data`.
///
/// Reads a little-endian `u64` from the start of `data`, then keeps the
/// `len` bytes that follow that `size_of::<u64>()`-byte prefix.
///
/// # Errors
///
/// Propagates the `io::Error` from `read_u64` when the prefix cannot be read.
///
/// # Panics
///
/// NOTE(review): `SharedData::range`, as shown elsewhere in this listing,
/// asserts `offset + len <= self.len`. A prefix that announces more bytes
/// than are available would therefore panic instead of returning `Err` —
/// confirm whether that is intended for untrusted input.
fn from_data(data: SharedData) -> io::Result<Self> {
|
||||
let len = data.as_ref().read_u64::<LittleEndian>()?;
|
||||
// `as usize` silently truncates where `usize` is narrower than 64 bits.
let data = data.range(mem::size_of::<u64>(), len as usize);
|
||||
Ok(DocIds { data })
|
||||
}
|
||||
|
||||
// Builds a `DocIds` from a vector of `DocumentId` by reinterpreting the
// vector as bytes and passing it to `from_bytes`, unwrapping the result.
// NOTE(review): two function headers (`from_document_ids`, `from_raw`) share
// this single body; presumably a rename shown as diff before/after — confirm.
// NOTE(review): `mem::transmute` from `Vec<DocumentId>` to `Vec<u8>` keeps the
// element-count length and capacity and relies on unspecified `Vec` layout;
// the std docs advise against transmuting between `Vec` types. The `unwrap`
// also panics if `from_bytes` rejects the bytes — verify.
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
|
||||
pub fn from_raw(vec: Vec<DocumentId>) -> Self {
|
||||
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
|
||||
}
|
||||
|
||||
/// Appends the serialized form of `self` to `bytes`.
///
/// Layout written: the payload length as a little-endian `u64`, followed by
/// the payload bytes themselves. This is the same prefix-then-payload shape
/// that `from_data` reads.
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.data.len() as u64;
|
||||
// The write targets an in-memory `Vec<u8>`, hence the `unwrap`.
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.data);
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
|
@ -51,9 +53,3 @@ impl DocIds {
|
|||
Set::new_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes a `DocIds` by forwarding to the `Serialize` impl of the value
/// returned by `self.data.as_ref()` (the `AsRef` impls visible in this listing
/// yield `&[u8]`). No length prefix or other framing is added here.
impl Serialize for DocIds {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.data.as_ref().serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,15 +2,13 @@ use std::slice::from_raw_parts;
|
|||
use std::io::{self, Write};
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::raw::MmapReadOnly;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[repr(C)]
|
||||
|
@ -21,27 +19,22 @@ struct Range {
|
|||
|
||||
// Holds two byte buffers, `ranges` and `indexes`.
// NOTE(review): each field appears twice in this span, once typed `Data` and
// once typed `SharedData`. This looks like the before/after lines of a diff
// hunk rather than four real fields — confirm which pair is current.
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: Data,
|
||||
indexes: Data,
|
||||
ranges: SharedData,
|
||||
indexes: SharedData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
/// Builds a `DocIndexes` from a read-only memory map of the file at `path`.
///
/// The map is wrapped in `Data::Mmap` and parsed by `from_data`.
///
/// # Errors
///
/// Returns the `io::Error` from `MmapReadOnly::open_path`, or whatever
/// `from_data` reports.
///
/// # Safety
///
/// NOTE(review): the caller contract is not spelled out in this listing;
/// presumably the mapped file must not be modified while the map is alive
/// — confirm against the `fst` documentation.
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
DocIndexes::from_data(Data::Mmap(mmap))
|
||||
}
|
||||
|
||||
/// Builds a `DocIndexes` from an owned byte vector.
///
/// The vector is moved into an `Arc` and passed to `from_shared_bytes` with
/// offset 0 and the vector's full length, so the whole buffer is parsed.
///
/// # Errors
///
/// Returns whatever error `from_shared_bytes` reports.
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
// Capture the length before `vec` is moved into the `Arc`.
let len = vec.len();
|
||||
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
}
|
||||
|
||||
// Builds a `DocIndexes` from the `len` bytes starting at `offset` inside the
// shared buffer `bytes`, by wrapping them and delegating to `from_data`.
// NOTE(review): two `let data = ...` lines are visible (`Data::Shared { .. }`
// and `SharedData { .. }`), and both move `bytes`. Presumably these are the
// before/after lines of a diff — confirm which one is current.
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
||||
let data = Data::Shared { bytes, offset, len };
|
||||
let data = SharedData { bytes, offset, len };
|
||||
DocIndexes::from_data(data)
|
||||
}
|
||||
|
||||
fn from_data(data: Data) -> io::Result<Self> {
|
||||
fn from_data(data: SharedData) -> io::Result<Self> {
|
||||
let ranges_len_offset = data.len() - size_of::<u64>();
|
||||
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize;
|
||||
|
|
|
@ -4,40 +4,30 @@ mod doc_indexes;
|
|||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::raw::MmapReadOnly;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
// Byte-buffer handle used by `DocIds` and `DocIndexes`.
// NOTE(review): this span interleaves two definitions. The first is an enum
// `Data` with a `Shared { bytes, offset, len }` variant and an
// `Mmap(MmapReadOnly)` variant; the second is a plain struct `SharedData`
// with the same three fields. The enum's closing brace is not visible, so
// this is presumably a diff before/after pair — confirm which is current.
// In both shapes `bytes` is a reference-counted `Vec<u8>`, and `offset`/`len`
// select the `offset..offset + len` window of it (see the `AsRef<[u8]>` impl).
#[derive(Clone)]
|
||||
enum Data {
|
||||
Shared {
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
},
|
||||
Mmap(MmapReadOnly),
|
||||
struct SharedData {
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
// `range` returns a sub-view of the current view. The new view shares the
// same `Arc` buffer (only the `Arc` is cloned), starts at the requested
// offset relative to the current view, and has the requested length.
//
// Panics: the shared-buffer path asserts that the requested offset plus
// length does not exceed the current view's length.
//
// NOTE(review): the sum inside the `assert!` is an unchecked `usize`
// addition; for very large arguments it can overflow (a panic in debug
// builds, wrap-around in default release builds) — consider `checked_add`.
// NOTE(review): two impl blocks (`impl Data`, `impl SharedData`) are
// interleaved in this span and the first is never closed; presumably diff
// before/after lines — confirm which version is current.
impl Data {
|
||||
pub fn range(&self, off: usize, l: usize) -> Data {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
assert!(off + l <= *len);
|
||||
Data::Shared {
|
||||
bytes: bytes.clone(),
|
||||
offset: offset + off,
|
||||
len: l,
|
||||
}
|
||||
},
|
||||
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
|
||||
impl SharedData {
|
||||
pub fn range(&self, offset: usize, len: usize) -> SharedData {
|
||||
assert!(offset + len <= self.len);
|
||||
SharedData {
|
||||
bytes: self.bytes.clone(),
|
||||
offset: self.offset + offset,
|
||||
len: len,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Data {
|
||||
fn default() -> Data {
|
||||
Data::Shared {
|
||||
impl Default for SharedData {
|
||||
fn default() -> SharedData {
|
||||
SharedData {
|
||||
bytes: Arc::default(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
|
@ -45,7 +35,7 @@ impl Default for Data {
|
|||
}
|
||||
}
|
||||
|
||||
impl Deref for Data {
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
|
@ -53,13 +43,8 @@ impl Deref for Data {
|
|||
}
|
||||
}
|
||||
|
||||
// Exposes the view as a byte slice: the `offset..offset + len` window of the
// shared buffer, or `as_slice()` of the memory map in the `Data::Mmap` arm.
// The slice indexing panics if the window lies outside the buffer.
// NOTE(review): two impl headers (`for Data`, `for SharedData`) and two
// bodies (a `match` and a direct slice expression) are interleaved here;
// presumably diff before/after lines — confirm which version is current.
impl AsRef<[u8]> for Data {
|
||||
impl AsRef<[u8]> for SharedData {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
&bytes[*offset..offset + len]
|
||||
},
|
||||
Data::Mmap(m) => m.as_slice(),
|
||||
}
|
||||
&self.bytes[self.offset..self.offset + self.len]
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue