mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Split position DB into fid and relative position DB
This commit is contained in:
parent
56b7209f26
commit
9b2653427d
11 changed files with 162 additions and 135 deletions
|
@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
field_id_word_count_docids,
|
||||
word_prefix_position_docids,
|
||||
word_prefix_fid_docids,
|
||||
script_language_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
|
@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_position_docids.clear(self.wtxn)?;
|
||||
word_fid_docids.clear(self.wtxn)?;
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
word_prefix_position_docids.clear(self.wtxn)?;
|
||||
word_prefix_fid_docids.clear(self.wtxn)?;
|
||||
script_language_docids.clear(self.wtxn)?;
|
||||
facet_id_f64_docids.clear(self.wtxn)?;
|
||||
facet_id_exists_docids.clear(self.wtxn)?;
|
||||
|
|
|
@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
|
|||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use fst::IntoStreamer;
|
||||
use heed::types::{ByteSlice, DecodeIgnore, Str};
|
||||
use heed::Database;
|
||||
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
|
||||
use heed::{BytesDecode, BytesEncode, Database, RwIter};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_prefix_position_docids,
|
||||
word_fid_docids,
|
||||
word_prefix_fid_docids,
|
||||
facet_id_f64_docids: _,
|
||||
facet_id_string_docids: _,
|
||||
field_id_docid_facet_f64s: _,
|
||||
|
@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||
for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
|
||||
// We delete the documents ids from the word prefix pair proximity database docids
|
||||
// and remove the empty pairs too.
|
||||
let db = db.remap_key_type::<ByteSlice>();
|
||||
let mut iter = db.iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (key, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let key = key.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&key, &docids)? };
|
||||
}
|
||||
}
|
||||
Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?;
|
||||
}
|
||||
|
||||
// We delete the documents ids that are under the pairs of words,
|
||||
// it is faster and use no memory to iterate over all the words pairs than
|
||||
// to compute the cartesian product of every words of the deleted documents.
|
||||
let mut iter =
|
||||
word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let bytes = bytes.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&bytes, &docids)? };
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the word level position docids.
|
||||
let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let bytes = bytes.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&bytes, &docids)? };
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We delete the documents ids that are under the word prefix level position docids.
|
||||
let mut iter =
|
||||
word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let bytes = bytes.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&bytes, &docids)? };
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
Self::delete_from_db(
|
||||
word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
Self::delete_from_db(
|
||||
word_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
Self::delete_from_db(
|
||||
word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
Self::delete_from_db(
|
||||
word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
Self::delete_from_db(
|
||||
word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
// Remove the documents ids from the field id word count database.
|
||||
let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
|
||||
while let Some((key, mut docids)) = iter.next().transpose()? {
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let key = key.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&key, &docids)? };
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
Self::delete_from_db(
|
||||
field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
|
||||
let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
|
||||
|
@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||
}
|
||||
|
||||
// Remove the documents ids from the script language database.
|
||||
let mut iter = script_language_docids.iter_mut(self.wtxn)?;
|
||||
while let Some((key, mut docids)) = iter.next().transpose()? {
|
||||
let previous_len = docids.len();
|
||||
docids -= &self.to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let key = key.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&key, &docids)? };
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
Self::delete_from_db(
|
||||
script_language_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
// We delete the documents ids that are under the facet field id values.
|
||||
remove_docids_from_facet_id_exists_docids(
|
||||
self.wtxn,
|
||||
|
@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||
soft_deletion_used: false,
|
||||
})
|
||||
}
|
||||
|
||||
fn delete_from_db<C>(
|
||||
mut iter: RwIter<UnalignedSlice<u8>, C>,
|
||||
to_delete_docids: &RoaringBitmap,
|
||||
) -> Result<()>
|
||||
where
|
||||
C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
|
||||
+ for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
|
||||
{
|
||||
while let Some(result) = iter.next() {
|
||||
let (bytes, mut docids) = result?;
|
||||
let previous_len = docids.len();
|
||||
docids -= to_delete_docids;
|
||||
if docids.is_empty() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
} else if docids.len() != previous_len {
|
||||
let bytes = bytes.to_owned();
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(&bytes, &docids)? };
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_from_word_prefix_docids(
|
||||
|
|
|
@ -7,14 +7,17 @@ use super::helpers::{
|
|||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::{DocumentId, Result};
|
||||
use crate::{
|
||||
absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
|
||||
DocumentId, Result,
|
||||
};
|
||||
|
||||
/// Extracts the word positions and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words at positions and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
|
@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||
for position in read_u32_ne_bytes(value) {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
let (fid, position) = relative_from_absolute_position(position);
|
||||
let position = bucketed_position(position);
|
||||
let position = absolute_from_relative_position(fid, position);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
|
||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(word_position_docids_sorter, indexer)
|
||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_position_docids_reader)
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
|||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use self::extract_word_position_docids::extract_word_fid_and_position_docids;
|
||||
use super::helpers::{
|
||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
||||
GrenadParameters, MergeFn, MergeableReader,
|
||||
|
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||
docid_word_positions_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
extract_word_fid_and_position_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
|
|
|
@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode};
|
|||
use log::debug;
|
||||
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU32Codec;
|
||||
use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec};
|
||||
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
|
||||
use crate::update::index_documents::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
||||
CursorClonableMmap, MergeFn,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
use crate::{bucketed_position, relative_from_absolute_position, Index, Result};
|
||||
|
||||
pub struct WordPrefixPositionDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
|
@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
|
|||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
|
||||
let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
|
||||
let (_fid, pos) = relative_from_absolute_position(pos);
|
||||
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
|
||||
|
@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
|
|||
let iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
|
||||
.remap_key_type::<StrBEU32Codec>();
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in iter {
|
||||
let ((word, pos), data) = result?;
|
||||
if word.starts_with(prefix) {
|
||||
let key = (prefix, pos);
|
||||
let bytes = StrBEU32Codec::bytes_encode(&key).unwrap();
|
||||
let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
|
||||
prefix_position_docids_sorter.insert(bytes, data)?;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue