mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Reduce the deepness of the word position document ids
This helps reduce the number of allocations.
This commit is contained in:
parent
7178b6c2c4
commit
b12bfcb03b
@ -1,5 +1,5 @@
|
|||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
use std::collections::{HashMap, BTreeSet, BTreeMap};
|
use std::collections::{HashMap, BTreeSet};
|
||||||
use std::convert::{TryFrom, TryInto};
|
use std::convert::{TryFrom, TryInto};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@ -11,11 +11,12 @@ use fst::{Streamer, IntoStreamer};
|
|||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use heed::types::*;
|
use heed::types::*;
|
||||||
use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions};
|
use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions};
|
||||||
|
use rayon::prelude::*;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use slice_group_by::StrGroupBy;
|
use slice_group_by::StrGroupBy;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId, AttributeId};
|
use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId, Position};
|
||||||
|
|
||||||
const LMDB_MAX_KEY_LENGTH: usize = 512;
|
const LMDB_MAX_KEY_LENGTH: usize = 512;
|
||||||
const ONE_MILLION: usize = 1_000_000;
|
const ONE_MILLION: usize = 1_000_000;
|
||||||
@ -50,8 +51,8 @@ struct Opt {
|
|||||||
|
|
||||||
struct Indexed {
|
struct Indexed {
|
||||||
fst: fst::Set<Vec<u8>>,
|
fst: fst::Set<Vec<u8>>,
|
||||||
postings_attrs: FastMap4<SmallVec32<u8>, RoaringBitmap>,
|
word_positions: FastMap4<SmallVec32<u8>, RoaringBitmap>,
|
||||||
postings_ids: FastMap4<SmallVec32<u8>, FastMap4<AttributeId, RoaringBitmap>>,
|
word_position_docids: FastMap4<(SmallVec32<u8>, Position), RoaringBitmap>,
|
||||||
headers: Vec<u8>,
|
headers: Vec<u8>,
|
||||||
documents: Vec<(DocumentId, Vec<u8>)>,
|
documents: Vec<(DocumentId, Vec<u8>)>,
|
||||||
}
|
}
|
||||||
@ -79,12 +80,12 @@ impl MtblKvStore {
|
|||||||
// we iterate over the fst to read the words in order
|
// we iterate over the fst to read the words in order
|
||||||
let mut stream = indexed.fst.stream();
|
let mut stream = indexed.fst.stream();
|
||||||
while let Some(word) = stream.next() {
|
while let Some(word) = stream.next() {
|
||||||
if let Some(attrs) = indexed.postings_attrs.remove(word) {
|
if let Some(positions) = indexed.word_positions.get(word) {
|
||||||
key.truncate(1);
|
key.truncate(1);
|
||||||
key.extend_from_slice(word);
|
key.extend_from_slice(word);
|
||||||
// We serialize the attrs ids into a buffer
|
// We serialize the positions into a buffer
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
attrs.serialize_into(&mut buffer)?;
|
positions.serialize_into(&mut buffer)?;
|
||||||
// that we write under the generated key into MTBL
|
// that we write under the generated key into MTBL
|
||||||
out.add(&key, &buffer).unwrap();
|
out.add(&key, &buffer).unwrap();
|
||||||
}
|
}
|
||||||
@ -98,26 +99,27 @@ impl MtblKvStore {
|
|||||||
while let Some(word) = stream.next() {
|
while let Some(word) = stream.next() {
|
||||||
key.truncate(1);
|
key.truncate(1);
|
||||||
key.extend_from_slice(word);
|
key.extend_from_slice(word);
|
||||||
if let Some(attrs) = indexed.postings_ids.remove(word) {
|
if let Some(positions) = indexed.word_positions.remove(word) {
|
||||||
let attrs: BTreeMap<_, _> = attrs.into_iter().collect();
|
|
||||||
// We iterate over all the attributes containing the documents ids
|
// We iterate over all the positions at which this word appears
|
||||||
for (attr, ids) in attrs {
|
for pos in positions {
|
||||||
// we postfix the word by the attribute id
|
let ids = indexed.word_position_docids.remove(&(SmallVec32::from(word), pos)).unwrap();
|
||||||
key.extend_from_slice(&attr.to_be_bytes());
|
// we postfix the word by the positions it appears in
|
||||||
|
let position_bytes = pos.to_be_bytes();
|
||||||
|
key.extend_from_slice(&position_bytes);
|
||||||
// We serialize the document ids into a buffer
|
// We serialize the document ids into a buffer
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
ids.serialize_into(&mut buffer)?;
|
ids.serialize_into(&mut buffer)?;
|
||||||
// that we write under the generated key into MTBL
|
// that we write under the generated key into MTBL
|
||||||
out.add(&key, &buffer).unwrap();
|
out.add(&key, &buffer).unwrap();
|
||||||
// And cleanup the attribute id afterward (u32 = 4 * u8)
|
// And cleanup the position afterward
|
||||||
key.truncate(key.len() - 4);
|
key.truncate(key.len() - position_bytes.len());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// postings ids keys are all prefixed by a '4'
|
// document keys are all prefixed by a '5'
|
||||||
key[0] = 5;
|
key[0] = 5;
|
||||||
indexed.documents.sort_unstable();
|
indexed.documents.sort_unstable_by_key(|(id, _)| *id);
|
||||||
for (id, content) in indexed.documents {
|
for (id, content) in indexed.documents {
|
||||||
key.truncate(1);
|
key.truncate(1);
|
||||||
key.extend_from_slice(&id.to_be_bytes());
|
key.extend_from_slice(&id.to_be_bytes());
|
||||||
@ -204,8 +206,8 @@ fn index_csv(
|
|||||||
eprintln!("{:?}: Indexing into an Indexed...", thread_index);
|
eprintln!("{:?}: Indexing into an Indexed...", thread_index);
|
||||||
|
|
||||||
let mut document = csv::StringRecord::new();
|
let mut document = csv::StringRecord::new();
|
||||||
let mut postings_attrs = FastMap4::default();
|
let mut word_positions = FastMap4::default();
|
||||||
let mut postings_ids = FastMap4::default();
|
let mut word_position_docids = FastMap4::default();
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
|
|
||||||
// Write the headers into a Vec of bytes.
|
// Write the headers into a Vec of bytes.
|
||||||
@ -234,12 +236,11 @@ fn index_csv(
|
|||||||
let position = (attr * MAX_POSITION + pos) as u32;
|
let position = (attr * MAX_POSITION + pos) as u32;
|
||||||
|
|
||||||
// We save the positions where this word has been seen.
|
// We save the positions where this word has been seen.
|
||||||
postings_attrs.entry(SmallVec32::from(word.as_bytes()))
|
word_positions.entry(SmallVec32::from(word.as_bytes()))
|
||||||
.or_insert_with(RoaringBitmap::new).insert(position);
|
.or_insert_with(RoaringBitmap::new).insert(position);
|
||||||
|
|
||||||
// We save the documents ids under the position and word we have seen it.
|
// We save the document ids under the word and position where it has been seen.
|
||||||
postings_ids.entry(SmallVec32::from(word.as_bytes()))
|
word_position_docids.entry((SmallVec32::from(word.as_bytes()), position)) // word + position
|
||||||
.or_insert_with(FastMap4::default).entry(position) // positions
|
|
||||||
.or_insert_with(RoaringBitmap::new).insert(document_id); // document ids
|
.or_insert_with(RoaringBitmap::new).insert(document_id); // document ids
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -253,14 +254,10 @@ fn index_csv(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// We store the words from the postings.
|
// We store the words from the postings.
|
||||||
let mut new_words = BTreeSet::default();
|
let new_words: BTreeSet<_> = word_position_docids.iter().map(|((w, _), _)| w).collect();
|
||||||
for (word, _new_ids) in &postings_ids {
|
let fst = fst::Set::from_iter(new_words)?;
|
||||||
new_words.insert(word.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallVec32::as_ref))?;
|
let indexed = Indexed { fst, headers, word_positions, word_position_docids, documents };
|
||||||
|
|
||||||
let indexed = Indexed { fst: new_words_fst, headers, postings_attrs, postings_ids, documents };
|
|
||||||
eprintln!("{:?}: Indexed created!", thread_index);
|
eprintln!("{:?}: Indexed created!", thread_index);
|
||||||
|
|
||||||
MtblKvStore::from_indexed(indexed).map(|x| vec![x])
|
MtblKvStore::from_indexed(indexed).map(|x| vec![x])
|
||||||
@ -371,7 +368,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
let csv_readers: Vec<_> = (0..num_threads).map(|_| csv::Reader::from_path(&file)).collect::<Result<_, _>>()?;
|
let csv_readers: Vec<_> = (0..num_threads).map(|_| csv::Reader::from_path(&file)).collect::<Result<_, _>>()?;
|
||||||
|
|
||||||
let stores: Vec<_> = csv_readers
|
let stores: Vec<_> = csv_readers
|
||||||
.into_iter()
|
.into_par_iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, rdr)| index_csv(rdr, i, num_threads))
|
.map(|(i, rdr)| index_csv(rdr, i, num_threads))
|
||||||
.collect::<Result<_, _>>()?;
|
.collect::<Result<_, _>>()?;
|
||||||
|
@ -34,6 +34,7 @@ pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
|
|||||||
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
|
||||||
pub type DocumentId = u32;
|
pub type DocumentId = u32;
|
||||||
pub type AttributeId = u32;
|
pub type AttributeId = u32;
|
||||||
|
pub type Position = u32;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Index {
|
pub struct Index {
|
||||||
|
Loading…
Reference in New Issue
Block a user