mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
feat: Introduce the "data-index" entry with merge compaction
This commit is contained in:
parent
0e856db4e6
commit
b636e5fe57
12 changed files with 251 additions and 231 deletions
|
@ -35,6 +35,10 @@ impl DocIds {
|
|||
Ok(DocIds { data })
|
||||
}
|
||||
|
||||
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
|
||||
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
|
|
|
@ -19,7 +19,7 @@ struct Range {
|
|||
end: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: Data,
|
||||
indexes: Data,
|
||||
|
@ -29,15 +29,14 @@ impl DocIndexes {
|
|||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
|
||||
let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
|
||||
let range_len = range_len as usize * mem::size_of::<Range>();
|
||||
let ranges_len_offset = mmap.as_slice().len() - mem::size_of::<u64>();
|
||||
let ranges_len = (&mmap.as_slice()[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize * mem::size_of::<Range>();
|
||||
|
||||
let offset = mem::size_of::<u64>() as usize;
|
||||
let ranges = Data::Mmap(mmap.range(offset, range_len));
|
||||
let ranges_offset = ranges_len_offset - ranges_len;
|
||||
let ranges = Data::Mmap(mmap.range(ranges_offset, ranges_len));
|
||||
|
||||
let len = mmap.len() - range_len - offset;
|
||||
let offset = offset + range_len;
|
||||
let indexes = Data::Mmap(mmap.range(offset, len));
|
||||
let indexes = Data::Mmap(mmap.range(0, ranges_offset));
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
@ -45,19 +44,22 @@ impl DocIndexes {
|
|||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let vec = Arc::new(vec);
|
||||
|
||||
let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
|
||||
let range_len = range_len as usize * mem::size_of::<Range>();
|
||||
let ranges_len_offset = vec.len() - mem::size_of::<u64>();
|
||||
let ranges_len = (&vec[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize * mem::size_of::<Range>();
|
||||
|
||||
let offset = mem::size_of::<u64>() as usize;
|
||||
let ranges_offset = ranges_len_offset - ranges_len;
|
||||
let ranges = Data::Shared {
|
||||
vec: vec.clone(),
|
||||
offset,
|
||||
len: range_len
|
||||
offset: ranges_offset,
|
||||
len: ranges_len,
|
||||
};
|
||||
|
||||
let len = vec.len() - range_len - offset;
|
||||
let offset = offset + range_len;
|
||||
let indexes = Data::Shared { vec, offset, len };
|
||||
let indexes = Data::Shared {
|
||||
vec: vec,
|
||||
offset: 0,
|
||||
len: ranges_offset,
|
||||
};
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
@ -94,6 +96,53 @@ impl Serialize for DocIndexes {
|
|||
}
|
||||
}
|
||||
|
||||
pub struct RawDocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl RawDocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
RawDocIndexesBuilder::new(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> RawDocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
RawDocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &[DocIndex]) -> io::Result<()> {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.start).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
// write the values
|
||||
let indexes = unsafe { into_u8_slice(indexes) };
|
||||
self.wtr.write_all(indexes)
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
// write the ranges
|
||||
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
// write the length of the ranges
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
keys: BTreeMap<String, u64>,
|
||||
indexes: Vec<Vec<DocIndex>>,
|
||||
|
@ -136,29 +185,27 @@ impl<W: Write> DocIndexesBuilder<W> {
|
|||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
self.into_inner().map(|_| ())
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
|
||||
for vec in &mut self.indexes {
|
||||
vec.sort_unstable();
|
||||
}
|
||||
|
||||
let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
|
||||
|
||||
// write values first
|
||||
let slice = unsafe { into_u8_slice(values.as_slice()) };
|
||||
self.wtr.write_all(slice)?;
|
||||
|
||||
// write ranges after
|
||||
let slice = unsafe { into_u8_slice(ranges.as_slice()) };
|
||||
self.wtr.write_all(slice)?;
|
||||
|
||||
// write the length of the ranges
|
||||
let len = ranges.len() as u64;
|
||||
|
||||
// TODO check if this is correct
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
unsafe {
|
||||
// write Ranges first
|
||||
let slice = into_u8_slice(ranges.as_slice());
|
||||
self.wtr.write_all(slice)?;
|
||||
|
||||
// write Values after
|
||||
let slice = into_u8_slice(values.as_slice());
|
||||
self.wtr.write_all(slice)?;
|
||||
}
|
||||
|
||||
self.wtr.flush()?;
|
||||
Ok(self.wtr)
|
||||
|
|
|
@ -7,7 +7,7 @@ use std::sync::Arc;
|
|||
use fst::raw::MmapReadOnly;
|
||||
|
||||
pub use self::doc_ids::{DocIds, DocIdsBuilder};
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder, RawDocIndexesBuilder};
|
||||
|
||||
#[derive(Clone)]
|
||||
enum Data {
|
||||
|
@ -19,6 +19,16 @@ enum Data {
|
|||
Mmap(MmapReadOnly),
|
||||
}
|
||||
|
||||
impl Default for Data {
|
||||
fn default() -> Data {
|
||||
Data::Shared {
|
||||
vec: Arc::default(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Data {
|
||||
type Target = [u8];
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue