mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
feat: Working on ops for Positive and Negative blobs
This commit is contained in:
parent
34b43d4002
commit
cc52d5dda5
18 changed files with 213 additions and 1479 deletions
72
src/data/doc_ids.rs
Normal file
72
src/data/doc_ids.rs
Normal file
|
@ -0,0 +1,72 @@
|
|||
use std::collections::BTreeSet;
|
||||
use std::slice::from_raw_parts;
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{io, mem};
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
use fst::raw::MmapReadOnly;
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::data::Data;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DocIds {
|
||||
doc_ids: Data,
|
||||
}
|
||||
|
||||
impl DocIds {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
let doc_ids = Data::Mmap(mmap);
|
||||
Ok(DocIds { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let len = vec.len();
|
||||
let doc_ids = Data::Shared {
|
||||
vec: Arc::new(vec),
|
||||
offset: 0,
|
||||
len: len
|
||||
};
|
||||
Ok(DocIds { doc_ids })
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
}
|
||||
|
||||
pub fn doc_ids(&self) -> &[DocumentId] {
|
||||
let slice = &self.doc_ids;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / mem::size_of::<DocumentId>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocIdsBuilder<W> {
|
||||
doc_ids: BTreeSet<DocumentId>,
|
||||
wrt: W,
|
||||
}
|
||||
|
||||
impl<W: io::Write> DocIdsBuilder<W> {
|
||||
pub fn new(wrt: W) -> Self {
|
||||
Self {
|
||||
doc_ids: BTreeSet::new(),
|
||||
wrt: wrt,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, doc: DocumentId) {
|
||||
self.doc_ids.insert(doc);
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
for id in self.doc_ids {
|
||||
self.wrt.write_u64::<NativeEndian>(id)?;
|
||||
}
|
||||
Ok(self.wrt)
|
||||
}
|
||||
}
|
180
src/data/doc_indexes.rs
Normal file
180
src/data/doc_indexes.rs
Normal file
|
@ -0,0 +1,180 @@
|
|||
use std::collections::btree_map::{BTreeMap, Iter, Entry};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::io::{self, Write};
|
||||
use std::path::Path;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::mem;
|
||||
|
||||
use fst::raw::MmapReadOnly;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::Data;
|
||||
|
||||
#[repr(C)]
|
||||
struct Range {
|
||||
start: u64,
|
||||
end: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DocIndexes {
|
||||
ranges: Data,
|
||||
indexes: Data,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
|
||||
let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
|
||||
let range_len = range_len as usize * mem::size_of::<Range>();
|
||||
|
||||
let offset = mem::size_of::<u64>() as usize;
|
||||
let ranges = Data::Mmap(mmap.range(offset, range_len));
|
||||
|
||||
let len = mmap.len() - range_len - offset;
|
||||
let offset = offset + range_len;
|
||||
let indexes = Data::Mmap(mmap.range(offset, len));
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let vec = Arc::new(vec);
|
||||
|
||||
let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
|
||||
let range_len = range_len as usize * mem::size_of::<Range>();
|
||||
|
||||
let offset = mem::size_of::<u64>() as usize;
|
||||
let ranges = Data::Shared {
|
||||
vec: vec.clone(),
|
||||
offset,
|
||||
len: range_len
|
||||
};
|
||||
|
||||
let len = vec.len() - range_len - offset;
|
||||
let offset = offset + range_len;
|
||||
let indexes = Data::Shared { vec, offset, len };
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn get(&self, index: u64) -> Option<&[DocIndex]> {
|
||||
self.ranges().get(index as usize).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
&self.indexes()[start..end]
|
||||
})
|
||||
}
|
||||
|
||||
fn ranges(&self) -> &[Range] {
|
||||
let slice = &self.ranges;
|
||||
let ptr = slice.as_ptr() as *const Range;
|
||||
let len = slice.len() / mem::size_of::<Range>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
|
||||
fn indexes(&self) -> &[DocIndex] {
|
||||
let slice = &self.indexes;
|
||||
let ptr = slice.as_ptr() as *const DocIndex;
|
||||
let len = slice.len() / mem::size_of::<DocIndex>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
keys: BTreeMap<String, u64>,
|
||||
indexes: Vec<Vec<DocIndex>>,
|
||||
number_docs: usize,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
Self {
|
||||
keys: BTreeMap::new(),
|
||||
indexes: Vec::new(),
|
||||
number_docs: 0,
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn number_doc_indexes(&self) -> usize {
|
||||
self.number_docs
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: String, value: DocIndex) {
|
||||
match self.keys.entry(key) {
|
||||
Entry::Vacant(e) => {
|
||||
let index = self.indexes.len() as u64;
|
||||
self.indexes.push(vec![value]);
|
||||
e.insert(index);
|
||||
},
|
||||
Entry::Occupied(e) => {
|
||||
let index = *e.get();
|
||||
let vec = &mut self.indexes[index as usize];
|
||||
vec.push(value);
|
||||
},
|
||||
}
|
||||
self.number_docs += 1;
|
||||
}
|
||||
|
||||
pub fn keys(&self) -> Iter<String, u64> {
|
||||
self.keys.iter()
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
self.into_inner().map(|_| ())
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
|
||||
for vec in &mut self.indexes {
|
||||
vec.sort_unstable();
|
||||
}
|
||||
|
||||
let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
|
||||
let len = ranges.len() as u64;
|
||||
|
||||
// TODO check if this is correct
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
unsafe {
|
||||
// write Ranges first
|
||||
let slice = into_u8_slice(ranges.as_slice());
|
||||
self.wtr.write_all(slice)?;
|
||||
|
||||
// write Values after
|
||||
let slice = into_u8_slice(values.as_slice());
|
||||
self.wtr.write_all(slice)?;
|
||||
}
|
||||
|
||||
self.wtr.flush()?;
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>, Vec<T>) {
|
||||
let cap = vecs.len();
|
||||
let mut ranges = Vec::with_capacity(cap);
|
||||
let mut values = Vec::with_capacity(number_docs);
|
||||
|
||||
for v in &vecs {
|
||||
let len = v.len() as u64;
|
||||
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);
|
||||
|
||||
let range = Range { start, end: start + len };
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
values.extend(vecs.into_iter().flatten());
|
||||
|
||||
(ranges, values)
|
||||
}
|
||||
|
||||
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
|
||||
let ptr = slice.as_ptr() as *const u8;
|
||||
let len = slice.len() * mem::size_of::<T>();
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
33
src/data/mod.rs
Normal file
33
src/data/mod.rs
Normal file
|
@ -0,0 +1,33 @@
|
|||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::raw::MmapReadOnly;
|
||||
|
||||
pub use self::doc_ids::{DocIds, DocIdsBuilder};
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
#[derive(Clone)]
|
||||
enum Data {
|
||||
Shared {
|
||||
vec: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
},
|
||||
Mmap(MmapReadOnly),
|
||||
}
|
||||
|
||||
impl Deref for Data {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
Data::Shared { vec, offset, len } => {
|
||||
&vec[*offset..offset + len]
|
||||
},
|
||||
Data::Mmap(m) => m.as_slice(),
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue