From daa3673c1cd3d463f8f551bd61c81be2e00186bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 6 Sep 2020 10:30:53 +0200 Subject: [PATCH] Invert the word docid positions key order --- src/bin/indexer.rs | 18 ++++++++--------- ...{str_beu32_codec.rs => beu32_str_codec.rs} | 20 +++++++++---------- src/heed_codec/mod.rs | 4 ++-- src/lib.rs | 6 +++--- src/search.rs | 2 +- 5 files changed, 24 insertions(+), 26 deletions(-) rename src/heed_codec/{str_beu32_codec.rs => beu32_str_codec.rs} (56%) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 8bf5e5435..1a37874c0 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -13,10 +13,7 @@ use bstr::ByteSlice as _; use csv::StringRecord; use flate2::read::GzDecoder; use fst::IntoStreamer; -use heed::BytesDecode; -use heed::BytesEncode; -use heed::EnvOpenOptions; -use heed::types::*; +use heed::{EnvOpenOptions, BytesEncode, types::*}; use log::{debug, info}; use memmap::Mmap; use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; @@ -26,7 +23,7 @@ use structopt::StructOpt; use milli::heed_codec::CsvStringRecordCodec; use milli::tokenizer::{simple_tokenizer, only_words}; -use milli::{SmallVec32, Index, DocumentId, BEU32, StrBEU32Codec}; +use milli::{SmallVec32, Index, DocumentId, BEU32}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_MILLION: usize = 1_000_000; @@ -202,12 +199,13 @@ impl Store { let mut key = vec![WORD_DOCID_POSITIONS_BYTE]; let mut buffer = Vec::new(); + // We serialize the document ids into a buffer + // We prefix the words by the document id. + key.extend_from_slice(&id.to_be_bytes()); + for (word, positions) in iter { - key.truncate(1); + key.truncate(1 + 4); key.extend_from_slice(word.as_bytes()); - // We prefix the words by the document id. - key.extend_from_slice(&id.to_be_bytes()); - // We serialize the document ids into a buffer buffer.clear(); buffer.reserve(positions.serialized_size()); positions.serialize_into(&mut buffer)?; @@ -350,7 +348,7 @@ fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> } else if key.starts_with(&[WORD_DOCID_POSITIONS_BYTE]) { // Write the postings lists - index.word_docid_positions.as_polymorph() + index.docid_word_positions.as_polymorph() .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } diff --git a/src/heed_codec/str_beu32_codec.rs b/src/heed_codec/beu32_str_codec.rs similarity index 56% rename from src/heed_codec/str_beu32_codec.rs rename to src/heed_codec/beu32_str_codec.rs index 23c52c09c..c525d6b5b 100644 --- a/src/heed_codec/str_beu32_codec.rs +++ b/src/heed_codec/beu32_str_codec.rs @@ -2,26 +2,26 @@ use std::borrow::Cow; use std::convert::TryInto; use std::str; -pub struct StrBEU32Codec; +pub struct BEU32StrCodec; -impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { - type DItem = (&'a str, u32); +impl<'a> heed::BytesDecode<'a> for BEU32StrCodec { + type DItem = (u32, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (str_bytes, n_bytes) = bytes.split_at(bytes.len() - 4); - let s = str::from_utf8(str_bytes).ok()?; + let (n_bytes, str_bytes) = bytes.split_at(4); let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?; - Some((s, n)) + let s = str::from_utf8(str_bytes).ok()?; + Some((n, s)) } } -impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { - type EItem = (&'a str, u32); +impl<'a> heed::BytesEncode<'a> for BEU32StrCodec { + type EItem = (u32, &'a str); - fn bytes_encode((s, n): &Self::EItem) -> Option> { + fn bytes_encode((n, s): &Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(s.len() + 4); - bytes.extend_from_slice(s.as_bytes()); bytes.extend_from_slice(&n.to_be_bytes()); + bytes.extend_from_slice(s.as_bytes()); Some(Cow::Owned(bytes)) } } diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index 1a0485994..fe449e23f 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -1,7 +1,7 @@ mod csv_string_record_codec; mod roaring_bitmap_codec; -mod str_beu32_codec; +mod beu32_str_codec; pub use self::csv_string_record_codec::CsvStringRecordCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec; -pub use self::str_beu32_codec::StrBEU32Codec; +pub use self::beu32_str_codec::BEU32StrCodec; diff --git a/src/lib.rs b/src/lib.rs index 8e1d174dc..051e4c26d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,7 +15,7 @@ use heed::{PolyDatabase, Database}; pub use self::search::{Search, SearchResult}; pub use self::criterion::{Criterion, default_criteria}; -pub use self::heed_codec::{RoaringBitmapCodec, StrBEU32Codec, CsvStringRecordCodec}; +pub use self::heed_codec::{RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec}; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; @@ -38,7 +38,7 @@ pub struct Index { /// A word and all the documents ids containing the word. pub word_docids: Database, /// Maps a word and a document id (u32) to all the positions where the given word appears. - pub word_docid_positions: Database, + pub docid_word_positions: Database, /// Maps the document id to the document as a CSV line. pub documents: Database, ByteSlice>, } @@ -48,7 +48,7 @@ impl Index { Ok(Index { main: env.create_poly_database(None)?, word_docids: env.create_database(Some("word-docids"))?, - word_docid_positions: env.create_database(Some("word-docid-positions"))?, + docid_word_positions: env.create_database(Some("docid-word-positions"))?, documents: env.create_database(Some("documents"))?, }) } diff --git a/src/search.rs b/src/search.rs index afbb742b0..9758862e7 100644 --- a/src/search.rs +++ b/src/search.rs @@ -151,7 +151,7 @@ impl<'a> Search<'a> { for (word, (_distance, docids)) in words { if docids.contains(candidate) { - match index.word_docid_positions.get(rtxn, &(word, candidate))? { + match index.docid_word_positions.get(rtxn, &(candidate, word))? { Some(positions) => union_positions.union_with(&positions), None => error!("position missing for candidate {} and word {}", candidate, word), }