mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 19:57:30 +01:00
Directly use a writer for the docid word positions
This commit is contained in:
parent
67577a3760
commit
99705deb7d
@ -1,4 +1,4 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
@ -86,7 +86,7 @@ struct IndexerOpt {
|
|||||||
max_nb_chunks: Option<usize>,
|
max_nb_chunks: Option<usize>,
|
||||||
|
|
||||||
/// MTBL max memory in bytes.
|
/// MTBL max memory in bytes.
|
||||||
#[structopt(long, default_value = "346030080")] // 330 MB
|
#[structopt(long, default_value = "440401920")] // 420 MB
|
||||||
max_memory: usize,
|
max_memory: usize,
|
||||||
|
|
||||||
/// Size of the linked hash map cache when indexing.
|
/// Size of the linked hash map cache when indexing.
|
||||||
@ -198,6 +198,14 @@ fn compute_words_pair_proximities(
|
|||||||
|
|
||||||
type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
|
type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
|
||||||
|
|
||||||
|
struct Readers {
|
||||||
|
main: Reader<Mmap>,
|
||||||
|
word_docids: Reader<Mmap>,
|
||||||
|
docid_word_positions: Reader<Mmap>,
|
||||||
|
words_pairs_proximities_docids: Reader<Mmap>,
|
||||||
|
documents: Reader<Mmap>,
|
||||||
|
}
|
||||||
|
|
||||||
struct Store {
|
struct Store {
|
||||||
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
||||||
word_docids_limit: usize,
|
word_docids_limit: usize,
|
||||||
@ -210,20 +218,12 @@ struct Store {
|
|||||||
// MTBL sorters
|
// MTBL sorters
|
||||||
main_sorter: Sorter<MergeFn>,
|
main_sorter: Sorter<MergeFn>,
|
||||||
word_docids_sorter: Sorter<MergeFn>,
|
word_docids_sorter: Sorter<MergeFn>,
|
||||||
docid_word_positions_sorter: Sorter<MergeFn>,
|
|
||||||
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
||||||
// MTBL writers
|
// MTBL writers
|
||||||
|
docid_word_positions_writer: Writer<File>,
|
||||||
documents_writer: Writer<File>,
|
documents_writer: Writer<File>,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Readers {
|
|
||||||
main: Reader<Mmap>,
|
|
||||||
word_docids: Reader<Mmap>,
|
|
||||||
docid_word_positions: Reader<Mmap>,
|
|
||||||
words_pairs_proximities_docids: Reader<Mmap>,
|
|
||||||
documents: Reader<Mmap>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Store {
|
impl Store {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
linked_hash_map_size: usize,
|
linked_hash_map_size: usize,
|
||||||
@ -247,13 +247,6 @@ impl Store {
|
|||||||
max_nb_chunks,
|
max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
let docid_word_positions_sorter = create_sorter(
|
|
||||||
docid_word_positions_merge,
|
|
||||||
chunk_compression_type,
|
|
||||||
chunk_compression_level,
|
|
||||||
max_nb_chunks,
|
|
||||||
max_memory,
|
|
||||||
);
|
|
||||||
let words_pairs_proximities_docids_sorter = create_sorter(
|
let words_pairs_proximities_docids_sorter = create_sorter(
|
||||||
words_pairs_proximities_docids_merge,
|
words_pairs_proximities_docids_merge,
|
||||||
chunk_compression_type,
|
chunk_compression_type,
|
||||||
@ -262,12 +255,12 @@ impl Store {
|
|||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut documents_builder = Writer::builder();
|
let documents_writer = tempfile().map(|f| {
|
||||||
documents_builder.compression_type(chunk_compression_type);
|
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||||
if let Some(level) = chunk_compression_level {
|
})?;
|
||||||
documents_builder.compression_level(level);
|
let docid_word_positions_writer = tempfile().map(|f| {
|
||||||
}
|
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||||
let documents_writer = tempfile().map(|f| documents_builder.build(f))?;
|
})?;
|
||||||
|
|
||||||
Ok(Store {
|
Ok(Store {
|
||||||
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||||
@ -280,9 +273,9 @@ impl Store {
|
|||||||
|
|
||||||
main_sorter,
|
main_sorter,
|
||||||
word_docids_sorter,
|
word_docids_sorter,
|
||||||
docid_word_positions_sorter,
|
|
||||||
words_pairs_proximities_docids_sorter,
|
words_pairs_proximities_docids_sorter,
|
||||||
|
|
||||||
|
docid_word_positions_writer,
|
||||||
documents_writer,
|
documents_writer,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -372,7 +365,7 @@ impl Store {
|
|||||||
|
|
||||||
self.documents_ids.insert(document_id);
|
self.documents_ids.insert(document_id);
|
||||||
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
||||||
Self::write_docid_word_positions(&mut self.docid_word_positions_sorter, document_id, words_positions)?;
|
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -406,7 +399,7 @@ impl Store {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn write_docid_word_positions(
|
fn write_docid_word_positions(
|
||||||
sorter: &mut Sorter<MergeFn>,
|
writer: &mut Writer<File>,
|
||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
words_positions: &HashMap<String, SmallVec32<Position>>,
|
words_positions: &HashMap<String, SmallVec32<Position>>,
|
||||||
) -> anyhow::Result<()>
|
) -> anyhow::Result<()>
|
||||||
@ -415,6 +408,9 @@ impl Store {
|
|||||||
let mut key = id.to_be_bytes().to_vec();
|
let mut key = id.to_be_bytes().to_vec();
|
||||||
let base_size = key.len();
|
let base_size = key.len();
|
||||||
|
|
||||||
|
// We order the words lexicographically so that they can be written directly, without going through a sorter.
|
||||||
|
let words_positions = BTreeMap::from_iter(words_positions);
|
||||||
|
|
||||||
for (word, positions) in words_positions {
|
for (word, positions) in words_positions {
|
||||||
key.truncate(base_size);
|
key.truncate(base_size);
|
||||||
key.extend_from_slice(word.as_bytes());
|
key.extend_from_slice(word.as_bytes());
|
||||||
@ -424,7 +420,7 @@ impl Store {
|
|||||||
.with_context(|| "could not serialize positions")?;
|
.with_context(|| "could not serialize positions")?;
|
||||||
// that we write under the generated key into MTBL
|
// that we write under the generated key into MTBL
|
||||||
if lmdb_key_valid_size(&key) {
|
if lmdb_key_valid_size(&key) {
|
||||||
sorter.insert(&key, &bytes)?;
|
writer.insert(&key, &bytes)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -542,16 +538,13 @@ impl Store {
|
|||||||
let mut main_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
|
let mut main_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
self.main_sorter.write_into(&mut main_wtr)?;
|
self.main_sorter.write_into(&mut main_wtr)?;
|
||||||
|
|
||||||
let mut docid_word_positions_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
|
|
||||||
self.docid_word_positions_sorter.write_into(&mut docid_word_positions_wtr)?;
|
|
||||||
|
|
||||||
let mut words_pairs_proximities_docids_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
|
let mut words_pairs_proximities_docids_wtr = tempfile().map(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||||
|
|
||||||
let main = writer_into_reader(main_wtr)?;
|
let main = writer_into_reader(main_wtr)?;
|
||||||
let word_docids = writer_into_reader(word_docids_wtr)?;
|
let word_docids = writer_into_reader(word_docids_wtr)?;
|
||||||
let docid_word_positions = writer_into_reader(docid_word_positions_wtr)?;
|
|
||||||
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr)?;
|
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr)?;
|
||||||
|
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer)?;
|
||||||
let documents = writer_into_reader(self.documents_writer)?;
|
let documents = writer_into_reader(self.documents_writer)?;
|
||||||
|
|
||||||
Ok(Readers {
|
Ok(Readers {
|
||||||
@ -602,7 +595,7 @@ fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn docid_word_positions_merge(key: &[u8], _values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
fn docid_word_positions_merge(key: &[u8], _values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
||||||
panic!("merging word docid positions is an error ({:?})", key.as_bstr())
|
panic!("merging docid word positions is an error ({:?})", key.as_bstr())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user