Document the Index types and the internal LMDB databases

This commit is contained in:
Kerollmops 2020-06-22 18:02:22 +02:00
parent 2f0e1afd16
commit ba3e805981
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 22 additions and 18 deletions

View File

@ -329,22 +329,22 @@ fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyh
} }
else if key.starts_with(&[1]) { else if key.starts_with(&[1]) {
// Write the postings lists // Write the postings lists
index.postings_attrs.as_polymorph() index.word_positions.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
} }
else if key.starts_with(&[2]) { else if key.starts_with(&[2]) {
// Write the prefix postings lists // Write the prefix postings lists
index.prefix_postings_attrs.as_polymorph() index.prefix_word_positions.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
} }
else if key.starts_with(&[3]) { else if key.starts_with(&[3]) {
// Write the postings lists // Write the postings lists
index.postings_ids.as_polymorph() index.word_position_docids.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
} }
else if key.starts_with(&[4]) { else if key.starts_with(&[4]) {
// Write the prefix postings lists // Write the prefix postings lists
index.prefix_postings_ids.as_polymorph() index.prefix_word_position_docids.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
} }
else if key.starts_with(&[5]) { else if key.starts_with(&[5]) {

View File

@ -36,24 +36,28 @@ pub type AttributeId = u32;
#[derive(Clone)] #[derive(Clone)]
pub struct Index { pub struct Index {
/// Contains many different types (e.g. the documents CSV headers).
pub main: PolyDatabase, pub main: PolyDatabase,
pub postings_attrs: Database<Str, RoaringBitmapCodec>, /// A word and all the positions where it appears in the whole dataset.
pub prefix_postings_attrs: Database<ByteSlice, RoaringBitmapCodec>, pub word_positions: Database<Str, RoaringBitmapCodec>,
pub postings_ids: Database<ByteSlice, RoaringBitmapCodec>, pub prefix_word_positions: Database<Str, RoaringBitmapCodec>,
pub prefix_postings_ids: Database<ByteSlice, RoaringBitmapCodec>, /// Maps a word at a position (u32) and all the documents ids where it appears.
pub word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
pub prefix_word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
/// Maps an internal document to the content of the document in CSV.
pub documents: Database<OwnedType<BEU32>, ByteSlice>, pub documents: Database<OwnedType<BEU32>, ByteSlice>,
} }
impl Index { impl Index {
pub fn new(env: &heed::Env) -> heed::Result<Index> { pub fn new(env: &heed::Env) -> heed::Result<Index> {
let main = env.create_poly_database(None)?; Ok(Index {
let postings_attrs = env.create_database(Some("postings-attrs"))?; main: env.create_poly_database(None)?,
let prefix_postings_attrs = env.create_database(Some("prefix-postings-attrs"))?; word_positions: env.create_database(Some("word-positions"))?,
let postings_ids = env.create_database(Some("postings-ids"))?; prefix_word_positions: env.create_database(Some("prefix-word-positions"))?,
let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?; word_position_docids: env.create_database(Some("word-position-docids"))?,
let documents = env.create_database(Some("documents"))?; prefix_word_position_docids: env.create_database(Some("prefix-word-position-docids"))?,
documents: env.create_database(Some("documents"))?,
Ok(Index { main, postings_attrs, prefix_postings_attrs, postings_ids, prefix_postings_ids, documents }) })
} }
pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> { pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
@ -107,7 +111,7 @@ impl Index {
let mut stream = fst.search(&dfa).into_stream(); let mut stream = fst.search(&dfa).into_stream();
while let Some(word) = stream.next() { while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?; let word = std::str::from_utf8(word)?;
if let Some(right) = self.postings_attrs.get(rtxn, word)? { if let Some(right) = self.word_positions.get(rtxn, word)? {
union_positions.union_with(&right); union_positions.union_with(&right);
derived_words.push((word.as_bytes().to_vec(), right)); derived_words.push((word.as_bytes().to_vec(), right));
count += 1; count += 1;
@ -131,7 +135,7 @@ impl Index {
if attrs.contains(pos) { if attrs.contains(pos) {
let mut key = word.clone(); let mut key = word.clone();
key.extend_from_slice(&pos.to_be_bytes()); key.extend_from_slice(&pos.to_be_bytes());
if let Some(right) = self.postings_ids.get(rtxn, &key).unwrap() { if let Some(right) = self.word_position_docids.get(rtxn, &key).unwrap() {
union_docids.union_with(&right); union_docids.union_with(&right);
} }
} }