mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
introduce exact_word_prefix database in index
This commit is contained in:
parent
ba0bb29cd8
commit
6dd2e4ffbd
@ -61,6 +61,7 @@ pub mod db_name {
|
||||
pub const WORD_DOCIDS: &str = "word-docids";
|
||||
pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
|
||||
pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
|
||||
pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
|
||||
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
||||
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
||||
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
||||
@ -91,6 +92,9 @@ pub struct Index {
|
||||
/// A prefix of word and all the documents ids containing this prefix.
|
||||
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
|
||||
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
||||
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
|
||||
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
||||
|
||||
@ -124,7 +128,7 @@ impl Index {
|
||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(15);
|
||||
options.max_dbs(16);
|
||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||
|
||||
let env = options.open(path)?;
|
||||
@ -132,6 +136,7 @@ impl Index {
|
||||
let word_docids = env.create_database(Some(WORD_DOCIDS))?;
|
||||
let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
|
||||
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
|
||||
let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
|
||||
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
|
||||
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_prefix_pair_proximity_docids =
|
||||
@ -154,6 +159,7 @@ impl Index {
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
|
@ -21,6 +21,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
word_prefix_pair_proximity_docids,
|
||||
@ -58,6 +59,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
word_prefix_docids.clear(self.wtxn)?;
|
||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||
docid_word_positions.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
|
@ -3,7 +3,7 @@ use std::collections::HashMap;
|
||||
|
||||
use fst::IntoStreamer;
|
||||
use heed::types::{ByteSlice, Str};
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use heed::{BytesDecode, BytesEncode, Database};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
@ -113,6 +113,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
docid_word_positions,
|
||||
word_pair_proximity_docids,
|
||||
field_id_word_count_docids,
|
||||
@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
// We write the new words FST into the main database.
|
||||
self.index.put_words_fst(self.wtxn, &new_words_fst)?;
|
||||
|
||||
// We iterate over the word prefix docids database and remove the deleted documents ids
|
||||
// from every docids list. We register the empty prefixes in an fst Set for future deletion.
|
||||
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
||||
let mut iter = word_prefix_docids.iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (prefix, mut docids) = result?;
|
||||
let prefix = prefix.to_owned();
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.documents_ids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
prefixes_to_delete.insert(prefix)?;
|
||||
} else if docids.len() != previous_len {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&prefix, &docids)? };
|
||||
}
|
||||
}
|
||||
let prefixes_to_delete =
|
||||
remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?;
|
||||
|
||||
drop(iter);
|
||||
let exact_prefix_to_delete = remove_from_word_prefix_docids(
|
||||
self.wtxn,
|
||||
exact_word_prefix_docids,
|
||||
&self.documents_ids,
|
||||
)?;
|
||||
|
||||
let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();
|
||||
|
||||
// We compute the new prefix FST and write it only if there is a change.
|
||||
let prefixes_to_delete = prefixes_to_delete.into_set();
|
||||
if !prefixes_to_delete.is_empty() {
|
||||
if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() {
|
||||
let new_words_prefixes_fst = {
|
||||
// We retrieve the current words prefixes FST from the database.
|
||||
let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference();
|
||||
let difference =
|
||||
words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference();
|
||||
|
||||
// We stream the new external ids that no longer contain the to-delete external ids.
|
||||
let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();
|
||||
@ -457,6 +448,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes the document ids in `to_remove` from every entry of a
/// prefix -> docids database and reports which prefixes ended up empty.
///
/// For each `(prefix, docids)` entry of `db`:
/// - if subtracting `to_remove` leaves no document id, the entry is deleted
///   and its prefix is recorded in the returned set;
/// - if some (but not all) document ids were removed, the entry is rewritten
///   with the reduced bitmap;
/// - otherwise the entry is left untouched.
///
/// Returns an `fst::Set` containing the prefixes whose entries were deleted.
///
/// # Errors
///
/// Propagates any error raised while opening the iterator, decoding an entry,
/// deleting or rewriting the current entry, or inserting into the set builder.
fn remove_from_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
db: &Database<Str, RoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
) -> Result<fst::Set<Vec<u8>>> {
|
||||
// NOTE(review): `fst::SetBuilder` expects keys in lexicographic order; this
// presumably holds because the database is iterated in key order. Confirm.
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
||||
|
||||
// We iterate over the word prefix docids database and remove the deleted documents ids
|
||||
// from every docids list. We register the empty prefixes in an fst Set for future deletion.
|
||||
let mut iter = db.iter_mut(txn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (prefix, mut docids) = result?;
|
||||
// Take an owned copy of the key so that nothing borrowed from the database
// is still held when the current entry is deleted or rewritten below.
let prefix = prefix.to_owned();
|
||||
// Remember the size before the subtraction to detect whether anything changed.
let previous_len = docids.len();
|
||||
docids -= to_remove;
|
||||
if docids.is_empty() {
|
||||
// SAFETY: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
prefixes_to_delete.insert(prefix)?;
|
||||
} else if docids.len() != previous_len {
|
||||
// SAFETY: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&prefix, &docids)? };
|
||||
}
|
||||
}
|
||||
|
||||
// Explicitly end the mutable iteration before finalising the set.
drop(iter);
|
||||
|
||||
Ok(prefixes_to_delete.into_set())
|
||||
}
|
||||
|
||||
fn remove_from_word_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
db: &heed::Database<Str, RoaringBitmapCodec>,
|
||||
|
Loading…
Reference in New Issue
Block a user