mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
introduce exact_word_prefix database in index
This commit is contained in:
parent
ba0bb29cd8
commit
6dd2e4ffbd
@ -61,6 +61,7 @@ pub mod db_name {
|
|||||||
pub const WORD_DOCIDS: &str = "word-docids";
|
pub const WORD_DOCIDS: &str = "word-docids";
|
||||||
pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
|
pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
|
||||||
pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
|
pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
|
||||||
|
pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
|
||||||
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
||||||
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
||||||
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
||||||
@ -91,6 +92,9 @@ pub struct Index {
|
|||||||
/// A prefix of word and all the documents ids containing this prefix.
|
/// A prefix of word and all the documents ids containing this prefix.
|
||||||
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
|
|
||||||
|
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
||||||
|
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
||||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
||||||
|
|
||||||
@ -124,7 +128,7 @@ impl Index {
|
|||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(15);
|
options.max_dbs(16);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
@ -132,6 +136,7 @@ impl Index {
|
|||||||
let word_docids = env.create_database(Some(WORD_DOCIDS))?;
|
let word_docids = env.create_database(Some(WORD_DOCIDS))?;
|
||||||
let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
|
let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
|
||||||
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
|
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
|
||||||
|
let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
|
||||||
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
|
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
|
||||||
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
let word_prefix_pair_proximity_docids =
|
let word_prefix_pair_proximity_docids =
|
||||||
@ -154,6 +159,7 @@ impl Index {
|
|||||||
word_docids,
|
word_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
|
exact_word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
|
@ -21,6 +21,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_docids,
|
word_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
|
exact_word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
@ -58,6 +59,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_docids.clear(self.wtxn)?;
|
word_docids.clear(self.wtxn)?;
|
||||||
exact_word_docids.clear(self.wtxn)?;
|
exact_word_docids.clear(self.wtxn)?;
|
||||||
word_prefix_docids.clear(self.wtxn)?;
|
word_prefix_docids.clear(self.wtxn)?;
|
||||||
|
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||||
docid_word_positions.clear(self.wtxn)?;
|
docid_word_positions.clear(self.wtxn)?;
|
||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
|
@ -3,7 +3,7 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::types::{ByteSlice, Str};
|
use heed::types::{ByteSlice, Str};
|
||||||
use heed::{BytesDecode, BytesEncode};
|
use heed::{BytesDecode, BytesEncode, Database};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
@ -113,6 +113,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_docids,
|
word_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
|
exact_word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
// We write the new words FST into the main database.
|
// We write the new words FST into the main database.
|
||||||
self.index.put_words_fst(self.wtxn, &new_words_fst)?;
|
self.index.put_words_fst(self.wtxn, &new_words_fst)?;
|
||||||
|
|
||||||
// We iterate over the word prefix docids database and remove the deleted documents ids
|
let prefixes_to_delete =
|
||||||
// from every docids lists. We register the empty prefixes in an fst Set for futur deletion.
|
remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?;
|
||||||
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
|
||||||
let mut iter = word_prefix_docids.iter_mut(self.wtxn)?;
|
|
||||||
while let Some(result) = iter.next() {
|
|
||||||
let (prefix, mut docids) = result?;
|
|
||||||
let prefix = prefix.to_owned();
|
|
||||||
let previous_len = docids.len();
|
|
||||||
docids -= &self.documents_ids;
|
|
||||||
if docids.is_empty() {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
prefixes_to_delete.insert(prefix)?;
|
|
||||||
} else if docids.len() != previous_len {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(&prefix, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
let exact_prefix_to_delete = remove_from_word_prefix_docids(
|
||||||
|
self.wtxn,
|
||||||
|
exact_word_prefix_docids,
|
||||||
|
&self.documents_ids,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();
|
||||||
|
|
||||||
// We compute the new prefix FST and write it only if there is a change.
|
// We compute the new prefix FST and write it only if there is a change.
|
||||||
let prefixes_to_delete = prefixes_to_delete.into_set();
|
if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() {
|
||||||
if !prefixes_to_delete.is_empty() {
|
|
||||||
let new_words_prefixes_fst = {
|
let new_words_prefixes_fst = {
|
||||||
// We retrieve the current words prefixes FST from the database.
|
// We retrieve the current words prefixes FST from the database.
|
||||||
let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||||
let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference();
|
let difference =
|
||||||
|
words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference();
|
||||||
|
|
||||||
// We stream the new external ids that no longer contain the to-delete external ids.
|
// We stream the new external ids that no longer contain the to-delete external ids.
|
||||||
let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();
|
let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();
|
||||||
@ -457,6 +448,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Removes the document ids in `to_remove` from every docids bitmap stored in `db`.
///
/// Walks all `(prefix, docids)` entries of `db` inside the write transaction `txn`:
/// - an entry whose bitmap becomes empty is deleted, and its prefix is recorded;
/// - an entry whose bitmap shrank but is not empty is rewritten in place;
/// - an entry that contains none of the removed ids is left untouched.
///
/// Returns an `fst::Set` holding the prefixes whose entries were deleted, so the
/// caller can remove them from any other structure that lists existing prefixes.
///
/// # Errors
///
/// Propagates (via `?`) any error raised while opening the iterator, decoding an
/// entry, deleting/updating the current entry, or inserting into the set builder.
fn remove_from_word_prefix_docids(
|
||||||
|
txn: &mut heed::RwTxn,
|
||||||
|
db: &Database<Str, RoaringBitmapCodec>,
|
||||||
|
to_remove: &RoaringBitmap,
|
||||||
|
) -> Result<fst::Set<Vec<u8>>> {
|
||||||
|
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
||||||
|
|
||||||
|
// We iterate over the word prefix docids database and remove the deleted document ids
|
||||||
|
// from every docids list. We register the emptied prefixes in an fst Set for future deletion.
|
||||||
|
let mut iter = db.iter_mut(txn)?;
|
||||||
|
while let Some(result) = iter.next() {
|
||||||
|
let (prefix, mut docids) = result?;
|
||||||
|
// Take an owned copy of the key so nothing borrowed from the database is held
|
||||||
|
// while the current entry is deleted or overwritten below.
|
||||||
|
let prefix = prefix.to_owned();
|
||||||
|
// Remember the size before the subtraction to detect whether anything changed.
|
||||||
|
let previous_len = docids.len();
|
||||||
|
docids -= to_remove;
|
||||||
|
if docids.is_empty() {
|
||||||
|
// SAFETY: we don't keep references from inside the LMDB database.
|
||||||
|
unsafe { iter.del_current()? };
|
||||||
|
prefixes_to_delete.insert(prefix)?;
|
||||||
|
} else if docids.len() != previous_len {
|
||||||
|
// SAFETY: we don't keep references from inside the LMDB database.
|
||||||
|
unsafe { iter.put_current(&prefix, &docids)? };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release the iterator explicitly before finalizing the set of emptied prefixes.
|
||||||
|
drop(iter);
|
||||||
|
|
||||||
|
Ok(prefixes_to_delete.into_set())
|
||||||
|
}
|
||||||
|
|
||||||
fn remove_from_word_docids(
|
fn remove_from_word_docids(
|
||||||
txn: &mut heed::RwTxn,
|
txn: &mut heed::RwTxn,
|
||||||
db: &heed::Database<Str, RoaringBitmapCodec>,
|
db: &heed::Database<Str, RoaringBitmapCodec>,
|
||||||
|
Loading…
Reference in New Issue
Block a user