Move the Index to its own module

2025-07-15 13:58:36 +02:00 · 2020-10-21 15:55:48 +02:00 · 2020-10-21 15:55:48 +02:00 · 5caf523fd9
commit 5caf523fd9
parent 2210818114
4 changed files with 121 additions and 112 deletions
--- a/src/index.rs
+++ b/src/index.rs
@@ -0,0 +1,110 @@
+use anyhow::Context;
+use csv::StringRecord;
+use heed::types::*;
+use heed::{PolyDatabase, Database};
+use roaring::RoaringBitmap;
+
+use crate::Search;
+use crate::{BEU32, DocumentId};
+use crate::{
+    RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
+    CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
+};
+
+pub const WORDS_FST_KEY: &str = "words-fst";
+pub const HEADERS_KEY: &str = "headers";
+pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
+
+#[derive(Clone)]
+pub struct Index {
+    /// Contains many different types (e.g. the documents CSV headers).
+    pub main: PolyDatabase,
+    /// A word and all the documents ids containing the word.
+    pub word_docids: Database<Str, RoaringBitmapCodec>,
+    /// Maps a word and a document id (u32) to all the positions where the given word appears.
+    pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
+    /// Maps the proximity between a pair of words with all the docids where this relation appears.
+    pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
+    /// Maps the document id to the document as a CSV line.
+    pub documents: Database<OwnedType<BEU32>, ByteSlice>,
+}
+
+impl Index {
+    pub fn new(env: &heed::Env) -> anyhow::Result<Index> {
+        Ok(Index {
+            main: env.create_poly_database(Some("main"))?,
+            word_docids: env.create_database(Some("word-docids"))?,
+            docid_word_positions: env.create_database(Some("docid-word-positions"))?,
+            word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?,
+            documents: env.create_database(Some("documents"))?,
+        })
+    }
+
+    pub fn documents_ids(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<RoaringBitmap>> {
+        Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
+    }
+
+    pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> {
+        self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers)
+    }
+
+    pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> {
+        self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY)
+    }
+
+    pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
+        match self.headers(rtxn)? {
+            Some(headers) => Ok(Some(headers.len())),
+            None => Ok(None),
+        }
+    }
+
+    pub fn put_fst<A: AsRef<[u8]>>(&self, wtxn: &mut heed::RwTxn, fst: &fst::Set<A>) -> anyhow::Result<()> {
+        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())?)
+    }
+
+    pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? {
+            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
+            None => Ok(None),
+        }
+    }
+
+    /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
+    pub fn documents<'t>(
+        &self,
+        rtxn: &'t heed::RoTxn,
+        iter: impl IntoIterator<Item=DocumentId>,
+    ) -> anyhow::Result<Vec<(DocumentId, StringRecord)>>
+    {
+        let ids: Vec<_> = iter.into_iter().collect();
+        let mut content = Vec::new();
+
+        for id in ids.iter().cloned() {
+            let document_content = self.documents.get(rtxn, &BEU32::new(id))?
+                .with_context(|| format!("Could not find document {}", id))?;
+            content.extend_from_slice(document_content);
+        }
+
+        let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]);
+
+        let mut documents = Vec::with_capacity(ids.len());
+        for (id, result) in ids.into_iter().zip(rdr.records()) {
+            documents.push((id, result?));
+        }
+
+        Ok(documents)
+    }
+
+    /// Returns the number of documents indexed in the database.
+    pub fn number_of_documents(&self, rtxn: &heed::RoTxn) -> anyhow::Result<usize> {
+        match self.documents_ids(rtxn)? {
+            Some(docids) => Ok(docids.len() as usize),
+            None => Ok(0),
+        }
+    }
+
+    pub fn search<'a>(&'a self, rtxn: &'a heed::RoTxn) -> Search<'a> {
+        Search::new(rtxn, self)
+    }
+}
--- a/src/indexing/merge_function.rs
+++ b/src/indexing/merge_function.rs
@@ -7,9 +7,9 @@ use roaring::RoaringBitmap;

 use crate::heed_codec::CboRoaringBitmapCodec;

-const WORDS_FST_KEY: &[u8] = crate::WORDS_FST_KEY.as_bytes();
-const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes();
-const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes();
+const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
+const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
+const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();

 pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
    match key {
--- a/src/indexing/store.rs
+++ b/src/indexing/store.rs
@@ -29,9 +29,9 @@ const ONE_KILOBYTE: usize = 1024 * 1024;
 const MAX_POSITION: usize = 1000;
 const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;

-const WORDS_FST_KEY: &[u8] = crate::WORDS_FST_KEY.as_bytes();
-const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes();
-const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes();
+const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes();
+const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes();
+const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes();

 pub struct Readers {
    pub main: Reader<FileFuse>,
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 mod criterion;
+mod index;
 mod indexing;
 mod mdfs;
 mod query_tokens;
@@ -11,19 +12,14 @@ pub mod tokenizer;

 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
-
-use anyhow::Context;
-use csv::StringRecord;
 use fxhash::{FxHasher32, FxHasher64};
-use heed::types::*;
-use heed::{PolyDatabase, Database};
-use roaring::RoaringBitmap;

-pub use self::update_store::UpdateStore;
-pub use self::search::{Search, SearchResult};
 pub use self::criterion::{Criterion, default_criteria};
+pub use self::index::Index;
+pub use self::search::{Search, SearchResult};
+pub use self::update_store::UpdateStore;
 pub use self::heed_codec::{
-    RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
+    RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
    CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
 };

@@ -38,100 +34,3 @@ pub type DocumentId = u32;
 pub type Attribute = u32;
 pub type Position = u32;

-pub const WORDS_FST_KEY: &str = "words-fst";
-pub const HEADERS_KEY: &str = "headers";
-pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
-
-#[derive(Clone)]
-pub struct Index {
-    /// Contains many different types (e.g. the documents CSV headers).
-    pub main: PolyDatabase,
-    /// A word and all the documents ids containing the word.
-    pub word_docids: Database<Str, RoaringBitmapCodec>,
-    /// Maps a word and a document id (u32) to all the positions where the given word appears.
-    pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of words with all the docids where this relation appears.
-    pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
-    /// Maps the document id to the document as a CSV line.
-    pub documents: Database<OwnedType<BEU32>, ByteSlice>,
-}
-
-impl Index {
-    pub fn new(env: &heed::Env) -> anyhow::Result<Index> {
-        Ok(Index {
-            main: env.create_poly_database(Some("main"))?,
-            word_docids: env.create_database(Some("word-docids"))?,
-            docid_word_positions: env.create_database(Some("docid-word-positions"))?,
-            word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?,
-            documents: env.create_database(Some("documents"))?,
-        })
-    }
-
-    pub fn documents_ids(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<RoaringBitmap>> {
-        Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?)
-    }
-
-    pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> {
-        self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers)
-    }
-
-    pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> {
-        self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY)
-    }
-
-    pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
-        match self.headers(rtxn)? {
-            Some(headers) => Ok(Some(headers.len())),
-            None => Ok(None),
-        }
-    }
-
-    pub fn put_fst<A: AsRef<[u8]>>(&self, wtxn: &mut heed::RwTxn, fst: &fst::Set<A>) -> anyhow::Result<()> {
-        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())?)
-    }
-
-    pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
-        match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? {
-            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
-            None => Ok(None),
-        }
-    }
-
-    /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
-    pub fn documents<'t>(
-        &self,
-        rtxn: &'t heed::RoTxn,
-        iter: impl IntoIterator<Item=DocumentId>,
-    ) -> anyhow::Result<Vec<(DocumentId, StringRecord)>>
-    {
-        let ids: Vec<_> = iter.into_iter().collect();
-        let mut content = Vec::new();
-
-        for id in ids.iter().cloned() {
-            let document_content = self.documents.get(rtxn, &BEU32::new(id))?
-                .with_context(|| format!("Could not find document {}", id))?;
-            content.extend_from_slice(document_content);
-        }
-
-        let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]);
-
-        let mut documents = Vec::with_capacity(ids.len());
-        for (id, result) in ids.into_iter().zip(rdr.records()) {
-            documents.push((id, result?));
-        }
-
-        Ok(documents)
-    }
-
-    /// Returns the number of documents indexed in the database.
-    pub fn number_of_documents(&self, rtxn: &heed::RoTxn) -> anyhow::Result<usize> {
-        match self.documents_ids(rtxn)? {
-            Some(docids) => Ok(docids.len() as usize),
-            None => Ok(0),
-        }
-    }
-
-    pub fn search<'a>(&'a self, rtxn: &'a heed::RoTxn) -> Search<'a> {
-        Search::new(rtxn, self)
-    }
-}