Document the Index types and the internal LMDB databases

This commit is contained in:
Kerollmops 2020-06-22 18:02:22 +02:00
parent 2f0e1afd16
commit ba3e805981
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 22 additions and 18 deletions

View File

@ -329,22 +329,22 @@ fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyh
}
else if key.starts_with(&[1]) {
// Write the postings lists
index.postings_attrs.as_polymorph()
index.word_positions.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
}
else if key.starts_with(&[2]) {
// Write the prefix postings lists
index.prefix_postings_attrs.as_polymorph()
index.prefix_word_positions.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
}
else if key.starts_with(&[3]) {
// Write the postings lists
index.postings_ids.as_polymorph()
index.word_position_docids.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
}
else if key.starts_with(&[4]) {
// Write the prefix postings lists
index.prefix_postings_ids.as_polymorph()
index.prefix_word_position_docids.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
}
else if key.starts_with(&[5]) {

View File

@ -36,24 +36,28 @@ pub type AttributeId = u32;
/// Handles to the databases that make up an index.
///
/// Every field is a database handle obtained from a `heed::Env` in `Index::new`.
#[derive(Clone)]
pub struct Index {
/// Contains many different types (e.g. the documents CSV headers).
pub main: PolyDatabase,
pub postings_attrs: Database<Str, RoaringBitmapCodec>,
pub prefix_postings_attrs: Database<ByteSlice, RoaringBitmapCodec>,
pub postings_ids: Database<ByteSlice, RoaringBitmapCodec>,
pub prefix_postings_ids: Database<ByteSlice, RoaringBitmapCodec>,
/// Maps a word to all the positions where it appears in the whole dataset.
pub word_positions: Database<Str, RoaringBitmapCodec>,
// NOTE(review): presumably the same mapping as `word_positions`, keyed by
// word prefixes instead of full words -- confirm against the indexer.
pub prefix_word_positions: Database<Str, RoaringBitmapCodec>,
/// Maps a word at a position (u32) to all the document ids where it appears.
///
/// Lookups build the key as the word bytes followed by the position
/// encoded as big-endian bytes.
pub word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
// NOTE(review): presumably the same mapping as `word_position_docids`, keyed
// by word prefixes instead of full words -- confirm against the indexer.
pub prefix_word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
/// Maps an internal document id to the content of that document in CSV.
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
}
impl Index {
/// Builds an `Index` from the databases of the given `heed` environment.
///
/// The unnamed database (`None`) backs the polymorphic `main` database; every
/// other database is requested from the environment under its own name.
///
/// # Errors
///
/// Returns the `heed` error of the first `create_*` call that fails.
pub fn new(env: &heed::Env) -> heed::Result<Index> {
let main = env.create_poly_database(None)?;
let postings_attrs = env.create_database(Some("postings-attrs"))?;
let prefix_postings_attrs = env.create_database(Some("prefix-postings-attrs"))?;
let postings_ids = env.create_database(Some("postings-ids"))?;
let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;
let documents = env.create_database(Some("documents"))?;
Ok(Index { main, postings_attrs, prefix_postings_attrs, postings_ids, prefix_postings_ids, documents })
Ok(Index {
main: env.create_poly_database(None)?,
word_positions: env.create_database(Some("word-positions"))?,
prefix_word_positions: env.create_database(Some("prefix-word-positions"))?,
word_position_docids: env.create_database(Some("word-position-docids"))?,
prefix_word_position_docids: env.create_database(Some("prefix-word-position-docids"))?,
documents: env.create_database(Some("documents"))?,
})
}
pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
@ -107,7 +111,7 @@ impl Index {
let mut stream = fst.search(&dfa).into_stream();
while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?;
if let Some(right) = self.postings_attrs.get(rtxn, word)? {
if let Some(right) = self.word_positions.get(rtxn, word)? {
union_positions.union_with(&right);
derived_words.push((word.as_bytes().to_vec(), right));
count += 1;
@ -131,7 +135,7 @@ impl Index {
if attrs.contains(pos) {
let mut key = word.clone();
key.extend_from_slice(&pos.to_be_bytes());
if let Some(right) = self.postings_ids.get(rtxn, &key).unwrap() {
if let Some(right) = self.word_position_docids.get(rtxn, &key).unwrap() {
union_docids.union_with(&right);
}
}