mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-12 06:24:29 +01:00
Introduce a database to store all docids for a word and attribute
This commit is contained in:
parent
a044cb6cc8
commit
374ec6773f
@ -1,5 +1,6 @@
|
|||||||
use std::collections::{BTreeSet, BTreeMap};
|
use std::collections::hash_map::Entry;
|
||||||
use std::convert::TryFrom;
|
use std::collections::{HashMap, BTreeSet, BTreeMap};
|
||||||
|
use std::convert::{TryFrom, TryInto};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
@ -344,6 +345,46 @@ fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyh
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn compute_words_attributes_docids(wtxn: &mut heed::RwTxn, index: &Index) -> anyhow::Result<()> {
|
||||||
|
eprintln!("Computing the attributes documents ids...");
|
||||||
|
|
||||||
|
let fst = match index.fst(&wtxn)? {
|
||||||
|
Some(fst) => fst.map_data(|s| s.to_vec())?,
|
||||||
|
None => return Ok(()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut word_attributes = HashMap::new();
|
||||||
|
let mut stream = fst.stream();
|
||||||
|
while let Some(word) = stream.next() {
|
||||||
|
word_attributes.clear();
|
||||||
|
|
||||||
|
// Loop on the word attributes and unions all the documents ids by attribute.
|
||||||
|
for result in index.word_position_docids.prefix_iter(wtxn, word)? {
|
||||||
|
let (key, docids) = result?;
|
||||||
|
let (_key_word, key_pos) = key.split_at(key.len() - 4);
|
||||||
|
let key_pos = key_pos.try_into().map(u32::from_be_bytes)?;
|
||||||
|
// If the key corresponds to the word (minus the attribute)
|
||||||
|
if key.len() == word.len() + 4 {
|
||||||
|
let attribute = key_pos / 1000;
|
||||||
|
match word_attributes.entry(attribute) {
|
||||||
|
Entry::Vacant(entry) => { entry.insert(docids); },
|
||||||
|
Entry::Occupied(mut entry) => entry.get_mut().union_with(&docids),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write this word attributes unions into LMDB.
|
||||||
|
let mut key = word.to_vec();
|
||||||
|
for (attribute, docids) in word_attributes.drain() {
|
||||||
|
key.truncate(word.len());
|
||||||
|
key.extend_from_slice(&attribute.to_be_bytes());
|
||||||
|
index.word_attribute_docids.put(wtxn, &key, &docids)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
fn main() -> anyhow::Result<()> {
|
||||||
let opt = Opt::from_args();
|
let opt = Opt::from_args();
|
||||||
|
|
||||||
@ -351,7 +392,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
let env = EnvOpenOptions::new()
|
let env = EnvOpenOptions::new()
|
||||||
.map_size(100 * 1024 * 1024 * 1024) // 100 GB
|
.map_size(100 * 1024 * 1024 * 1024) // 100 GB
|
||||||
.max_readers(10)
|
.max_readers(10)
|
||||||
.max_dbs(5)
|
.max_dbs(10)
|
||||||
.open(opt.database)?;
|
.open(opt.database)?;
|
||||||
|
|
||||||
let index = Index::new(&env)?;
|
let index = Index::new(&env)?;
|
||||||
@ -370,7 +411,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
eprintln!("We are writing into LMDB...");
|
eprintln!("We are writing into LMDB...");
|
||||||
let mut wtxn = env.write_txn()?;
|
let mut wtxn = env.write_txn()?;
|
||||||
MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?;
|
MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?;
|
||||||
// FIXME Why is this count wrong? (indicates 99 when must return 100)
|
compute_words_attributes_docids(&mut wtxn, &index)?;
|
||||||
let count = index.documents.len(&wtxn)?;
|
let count = index.documents.len(&wtxn)?;
|
||||||
wtxn.commit()?;
|
wtxn.commit()?;
|
||||||
eprintln!("Wrote {} documents into LMDB", count);
|
eprintln!("Wrote {} documents into LMDB", count);
|
||||||
|
@ -30,7 +30,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
let env = EnvOpenOptions::new()
|
let env = EnvOpenOptions::new()
|
||||||
.map_size(100 * 1024 * 1024 * 1024) // 100 GB
|
.map_size(100 * 1024 * 1024 * 1024) // 100 GB
|
||||||
.max_readers(10)
|
.max_readers(10)
|
||||||
.max_dbs(5)
|
.max_dbs(10)
|
||||||
.open(opt.database)?;
|
.open(opt.database)?;
|
||||||
|
|
||||||
let index = Index::new(&env)?;
|
let index = Index::new(&env)?;
|
||||||
|
16
src/lib.rs
16
src/lib.rs
@ -44,6 +44,8 @@ pub struct Index {
|
|||||||
/// Maps a word at a position (u32) and all the documents ids where it appears.
|
/// Maps a word at a position (u32) and all the documents ids where it appears.
|
||||||
pub word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
|
pub word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
|
||||||
pub prefix_word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
|
pub prefix_word_position_docids: Database<ByteSlice, RoaringBitmapCodec>,
|
||||||
|
/// Maps a word and an attribute (u32) to all the documents ids that it appears in.
|
||||||
|
pub word_attribute_docids: Database<ByteSlice, RoaringBitmapCodec>,
|
||||||
/// Maps an internal document to the content of the document in CSV.
|
/// Maps an internal document to the content of the document in CSV.
|
||||||
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
}
|
}
|
||||||
@ -56,6 +58,7 @@ impl Index {
|
|||||||
prefix_word_positions: env.create_database(Some("prefix-word-positions"))?,
|
prefix_word_positions: env.create_database(Some("prefix-word-positions"))?,
|
||||||
word_position_docids: env.create_database(Some("word-position-docids"))?,
|
word_position_docids: env.create_database(Some("word-position-docids"))?,
|
||||||
prefix_word_position_docids: env.create_database(Some("prefix-word-position-docids"))?,
|
prefix_word_position_docids: env.create_database(Some("prefix-word-position-docids"))?,
|
||||||
|
word_attribute_docids: env.create_database(Some("word-attribute-docids"))?,
|
||||||
documents: env.create_database(Some("documents"))?,
|
documents: env.create_database(Some("documents"))?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -64,10 +67,17 @@ impl Index {
|
|||||||
self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
|
self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
|
||||||
|
match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
|
||||||
|
Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {
|
pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {
|
||||||
let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
|
let fst = match self.fst(rtxn)? {
|
||||||
Some(bytes) => fst::Set::new(bytes)?,
|
Some(fst) => fst,
|
||||||
None => return Ok(Vec::new()),
|
None => return Ok(vec![]),
|
||||||
};
|
};
|
||||||
|
|
||||||
let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
|
let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user