mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 06:44:27 +01:00
Merge #3819
3819: Remove the `docid_word_positions` database r=Kerollmops a=loiclec Remove the `docid_word_positions` database, which was only used during deletion operations. In the process, also fixes https://github.com/meilisearch/meilisearch/issues/3816 Co-authored-by: Loïc Lecrenier <loic.lecrenier@icloud.com>
This commit is contained in:
commit
9dcf1da59d
@ -21,10 +21,9 @@ use crate::heed_codec::facet::{
|
|||||||
};
|
};
|
||||||
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
|
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
|
||||||
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
|
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
|
||||||
Search, U8StrStrCodec, BEU16, BEU32,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||||
@ -111,9 +110,6 @@ pub struct Index {
|
|||||||
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
||||||
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
|
||||||
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
|
|
||||||
|
|
||||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||||
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
|
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
|
||||||
@ -177,7 +173,6 @@ impl Index {
|
|||||||
let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
|
let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
|
||||||
let exact_word_prefix_docids =
|
let exact_word_prefix_docids =
|
||||||
env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
|
||||||
let docid_word_positions = env.create_database(&mut wtxn, Some(DOCID_WORD_POSITIONS))?;
|
|
||||||
let word_pair_proximity_docids =
|
let word_pair_proximity_docids =
|
||||||
env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
let script_language_docids =
|
let script_language_docids =
|
||||||
@ -220,7 +215,6 @@ impl Index {
|
|||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
docid_word_positions,
|
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
script_language_docids,
|
script_language_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
|
@ -5,52 +5,6 @@
|
|||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||||
|
|
||||||
// #[cfg(test)]
|
|
||||||
// pub mod allocator {
|
|
||||||
// use std::alloc::{GlobalAlloc, System};
|
|
||||||
// use std::sync::atomic::{self, AtomicI64};
|
|
||||||
|
|
||||||
// #[global_allocator]
|
|
||||||
// pub static ALLOC: CountingAlloc = CountingAlloc {
|
|
||||||
// max_resident: AtomicI64::new(0),
|
|
||||||
// resident: AtomicI64::new(0),
|
|
||||||
// allocated: AtomicI64::new(0),
|
|
||||||
// };
|
|
||||||
|
|
||||||
// pub struct CountingAlloc {
|
|
||||||
// pub max_resident: AtomicI64,
|
|
||||||
// pub resident: AtomicI64,
|
|
||||||
// pub allocated: AtomicI64,
|
|
||||||
// }
|
|
||||||
// unsafe impl GlobalAlloc for CountingAlloc {
|
|
||||||
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
|
|
||||||
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
|
|
||||||
// let old_resident =
|
|
||||||
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
|
|
||||||
|
|
||||||
// let resident = old_resident + layout.size() as i64;
|
|
||||||
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
|
|
||||||
|
|
||||||
// // if layout.size() > 1_000_000 {
|
|
||||||
// // eprintln!(
|
|
||||||
// // "allocating {} with new resident size: {resident}",
|
|
||||||
// // layout.size() / 1_000_000
|
|
||||||
// // );
|
|
||||||
// // // let trace = std::backtrace::Backtrace::capture();
|
|
||||||
// // // let t = trace.to_string();
|
|
||||||
// // // eprintln!("{t}");
|
|
||||||
// // }
|
|
||||||
|
|
||||||
// System.alloc(layout)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
|
|
||||||
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
|
|
||||||
// System.dealloc(ptr, layout)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
pub mod documents;
|
pub mod documents;
|
||||||
|
|
||||||
|
@ -89,7 +89,6 @@ Create a snapshot test of the given database.
|
|||||||
- `exact_word_docids`
|
- `exact_word_docids`
|
||||||
- `word_prefix_docids`
|
- `word_prefix_docids`
|
||||||
- `exact_word_prefix_docids`
|
- `exact_word_prefix_docids`
|
||||||
- `docid_word_positions`
|
|
||||||
- `word_pair_proximity_docids`
|
- `word_pair_proximity_docids`
|
||||||
- `word_prefix_pair_proximity_docids`
|
- `word_prefix_pair_proximity_docids`
|
||||||
- `word_position_docids`
|
- `word_position_docids`
|
||||||
@ -217,11 +216,6 @@ pub fn snap_exact_word_prefix_docids(index: &Index) -> String {
|
|||||||
&format!("{s:<16} {}", display_bitmap(&b))
|
&format!("{s:<16} {}", display_bitmap(&b))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
pub fn snap_docid_word_positions(index: &Index) -> String {
|
|
||||||
make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| {
|
|
||||||
&format!("{idx:<6} {s:<16} {}", display_bitmap(&b))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
|
pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
|
||||||
make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| {
|
make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| {
|
||||||
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
|
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
|
||||||
@ -477,9 +471,6 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, exact_word_prefix_docids) => {{
|
($index:ident, exact_word_prefix_docids) => {{
|
||||||
$crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
|
$crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
|
||||||
}};
|
}};
|
||||||
($index:ident, docid_word_positions) => {{
|
|
||||||
$crate::snapshot_tests::snap_docid_word_positions(&$index)
|
|
||||||
}};
|
|
||||||
($index:ident, word_pair_proximity_docids) => {{
|
($index:ident, word_pair_proximity_docids) => {{
|
||||||
$crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
|
$crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
@ -23,7 +23,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
docid_word_positions,
|
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
prefix_word_pair_proximity_docids,
|
prefix_word_pair_proximity_docids,
|
||||||
@ -80,7 +79,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
exact_word_docids.clear(self.wtxn)?;
|
exact_word_docids.clear(self.wtxn)?;
|
||||||
word_prefix_docids.clear(self.wtxn)?;
|
word_prefix_docids.clear(self.wtxn)?;
|
||||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||||
docid_word_positions.clear(self.wtxn)?;
|
|
||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
@ -141,7 +139,6 @@ mod tests {
|
|||||||
|
|
||||||
assert!(index.word_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
|
|
||||||
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
||||||
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::collections::btree_map::Entry;
|
use std::collections::btree_map::Entry;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
|
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
|
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
|
||||||
@ -15,8 +15,7 @@ use crate::facet::FacetType;
|
|||||||
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
use crate::{
|
use crate::{
|
||||||
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
|
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
|
||||||
SmallString32, BEU32,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||||
@ -232,7 +231,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
docid_word_positions,
|
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
@ -251,23 +249,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
|
// Remove from the documents database
|
||||||
// Retrieve the words contained in the documents.
|
|
||||||
let mut words = Vec::new();
|
|
||||||
for docid in &self.to_delete_docids {
|
for docid in &self.to_delete_docids {
|
||||||
documents.delete(self.wtxn, &BEU32::new(docid))?;
|
documents.delete(self.wtxn, &BEU32::new(docid))?;
|
||||||
|
|
||||||
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
|
|
||||||
// We create an iterator to be able to get the content and delete the key-value itself.
|
|
||||||
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
|
|
||||||
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
|
|
||||||
while let Some(result) = iter.next() {
|
|
||||||
let ((_docid, word), _positions) = result?;
|
|
||||||
// This boolean will indicate if we must remove this word from the words FST.
|
|
||||||
words.push((SmallString32::from(word), false));
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// We acquire the current external documents ids map...
|
// We acquire the current external documents ids map...
|
||||||
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
|
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
|
||||||
@ -278,42 +262,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
let new_external_documents_ids = new_external_documents_ids.into_static();
|
let new_external_documents_ids = new_external_documents_ids.into_static();
|
||||||
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
|
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
|
||||||
|
|
||||||
// Maybe we can improve the get performance of the words
|
let mut words_to_keep = BTreeSet::default();
|
||||||
// if we sort the words first, keeping the LMDB pages in cache.
|
let mut words_to_delete = BTreeSet::default();
|
||||||
words.sort_unstable();
|
|
||||||
|
|
||||||
// We iterate over the words and delete the documents ids
|
// We iterate over the words and delete the documents ids
|
||||||
// from the word docids database.
|
// from the word docids database.
|
||||||
for (word, must_remove) in &mut words {
|
remove_from_word_docids(
|
||||||
remove_from_word_docids(
|
self.wtxn,
|
||||||
self.wtxn,
|
word_docids,
|
||||||
word_docids,
|
&self.to_delete_docids,
|
||||||
word.as_str(),
|
&mut words_to_keep,
|
||||||
must_remove,
|
&mut words_to_delete,
|
||||||
&self.to_delete_docids,
|
)?;
|
||||||
)?;
|
remove_from_word_docids(
|
||||||
|
self.wtxn,
|
||||||
remove_from_word_docids(
|
exact_word_docids,
|
||||||
self.wtxn,
|
&self.to_delete_docids,
|
||||||
exact_word_docids,
|
&mut words_to_keep,
|
||||||
word.as_str(),
|
&mut words_to_delete,
|
||||||
must_remove,
|
)?;
|
||||||
&self.to_delete_docids,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We construct an FST set that contains the words to delete from the words FST.
|
// We construct an FST set that contains the words to delete from the words FST.
|
||||||
let words_to_delete =
|
let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
|
||||||
words.iter().filter_map(
|
|
||||||
|(word, must_remove)| {
|
|
||||||
if *must_remove {
|
|
||||||
Some(word.as_str())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
},
|
|
||||||
);
|
|
||||||
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
|
|
||||||
|
|
||||||
let new_words_fst = {
|
let new_words_fst = {
|
||||||
// We retrieve the current words FST from the database.
|
// We retrieve the current words FST from the database.
|
||||||
@ -532,23 +501,24 @@ fn remove_from_word_prefix_docids(
|
|||||||
fn remove_from_word_docids(
|
fn remove_from_word_docids(
|
||||||
txn: &mut heed::RwTxn,
|
txn: &mut heed::RwTxn,
|
||||||
db: &heed::Database<Str, RoaringBitmapCodec>,
|
db: &heed::Database<Str, RoaringBitmapCodec>,
|
||||||
word: &str,
|
|
||||||
must_remove: &mut bool,
|
|
||||||
to_remove: &RoaringBitmap,
|
to_remove: &RoaringBitmap,
|
||||||
|
words_to_keep: &mut BTreeSet<String>,
|
||||||
|
words_to_remove: &mut BTreeSet<String>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// We create an iterator to be able to get the content and delete the word docids.
|
// We create an iterator to be able to get the content and delete the word docids.
|
||||||
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing
|
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing
|
||||||
// the LMDB B-Tree two times but only once.
|
// the LMDB B-Tree two times but only once.
|
||||||
let mut iter = db.prefix_iter_mut(txn, word)?;
|
let mut iter = db.iter_mut(txn)?;
|
||||||
if let Some((key, mut docids)) = iter.next().transpose()? {
|
while let Some((key, mut docids)) = iter.next().transpose()? {
|
||||||
if key == word {
|
let previous_len = docids.len();
|
||||||
let previous_len = docids.len();
|
docids -= to_remove;
|
||||||
docids -= to_remove;
|
if docids.is_empty() {
|
||||||
if docids.is_empty() {
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
unsafe { iter.del_current()? };
|
||||||
unsafe { iter.del_current()? };
|
words_to_remove.insert(key.to_owned());
|
||||||
*must_remove = true;
|
} else {
|
||||||
} else if docids.len() != previous_len {
|
words_to_keep.insert(key.to_owned());
|
||||||
|
if docids.len() != previous_len {
|
||||||
let key = key.to_owned();
|
let key = key.to_owned();
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
unsafe { iter.put_current(&key, &docids)? };
|
unsafe { iter.put_current(&key, &docids)? };
|
||||||
@ -627,7 +597,7 @@ mod tests {
|
|||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::{db_snap, Filter};
|
use crate::{db_snap, Filter, Search};
|
||||||
|
|
||||||
fn delete_documents<'t>(
|
fn delete_documents<'t>(
|
||||||
wtxn: &mut RwTxn<'t, '_>,
|
wtxn: &mut RwTxn<'t, '_>,
|
||||||
@ -1199,4 +1169,52 @@ mod tests {
|
|||||||
DeletionStrategy::AlwaysSoft,
|
DeletionStrategy::AlwaysSoft,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn delete_words_exact_attributes() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|settings| {
|
||||||
|
settings.set_primary_key(S("id"));
|
||||||
|
settings.set_searchable_fields(vec![S("text"), S("exact")]);
|
||||||
|
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "id": 0, "text": "hello" },
|
||||||
|
{ "id": 1, "exact": "hello"}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
db_snap!(index, word_docids, 1, @r###"
|
||||||
|
hello [0, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, exact_word_docids, 1, @r###"
|
||||||
|
hello [1, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let deleted_internal_ids =
|
||||||
|
delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard);
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_docids, 2, @r###"
|
||||||
|
hello [0, ]
|
||||||
|
"###);
|
||||||
|
db_snap!(index, exact_word_docids, 2, @"");
|
||||||
|
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("hello");
|
||||||
|
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -325,8 +325,6 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
// send docid_word_positions_chunk to DB writer
|
// send docid_word_positions_chunk to DB writer
|
||||||
let docid_word_positions_chunk =
|
let docid_word_positions_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||||
let _ = lmdb_writer_sx
|
|
||||||
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
|
|
||||||
|
|
||||||
let _ =
|
let _ =
|
||||||
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
|
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
|
||||||
|
@ -4,7 +4,6 @@ use std::result::Result as StdResult;
|
|||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::read_u32_ne_bytes;
|
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
use crate::update::index_documents::transform::Operation;
|
use crate::update::index_documents::transform::Operation;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
@ -22,10 +21,6 @@ pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Co
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
|
|
||||||
read_u32_ne_bytes(slice).collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.reserve(bitmap.serialized_size());
|
buffer.reserve(bitmap.serialized_size());
|
||||||
|
@ -14,8 +14,8 @@ pub use grenad_helpers::{
|
|||||||
};
|
};
|
||||||
pub use merge_functions::{
|
pub use merge_functions::{
|
||||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
|
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
|
||||||
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
|
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
|
||||||
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
|
MergeFn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::MAX_WORD_LENGTH;
|
use crate::MAX_WORD_LENGTH;
|
||||||
|
@ -2513,6 +2513,5 @@ mod tests {
|
|||||||
|
|
||||||
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
||||||
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||||
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,24 +7,19 @@ use std::io;
|
|||||||
use charabia::{Language, Script};
|
use charabia::{Language, Script};
|
||||||
use grenad::MergerBuilder;
|
use grenad::MergerBuilder;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::{BytesDecode, RwTxn};
|
use heed::RwTxn;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap,
|
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
||||||
valid_lmdb_key, CursorClonableMmap,
|
|
||||||
};
|
};
|
||||||
use super::{ClonableMmap, MergeFn};
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::update::facet::FacetsUpdate;
|
use crate::update::facet::FacetsUpdate;
|
||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::{
|
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
|
||||||
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
|
|
||||||
Result,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub(crate) enum TypedChunk {
|
pub(crate) enum TypedChunk {
|
||||||
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
|
|
||||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||||
Documents(grenad::Reader<CursorClonableMmap>),
|
Documents(grenad::Reader<CursorClonableMmap>),
|
||||||
@ -56,29 +51,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
) -> Result<(RoaringBitmap, bool)> {
|
) -> Result<(RoaringBitmap, bool)> {
|
||||||
let mut is_merged_database = false;
|
let mut is_merged_database = false;
|
||||||
match typed_chunk {
|
match typed_chunk {
|
||||||
TypedChunk::DocidWordPositions(docid_word_positions_iter) => {
|
|
||||||
write_entries_into_database(
|
|
||||||
docid_word_positions_iter,
|
|
||||||
&index.docid_word_positions,
|
|
||||||
wtxn,
|
|
||||||
index_is_empty,
|
|
||||||
|value, buffer| {
|
|
||||||
// ensure that values are unique and ordered
|
|
||||||
let positions = roaring_bitmap_from_u32s_array(value);
|
|
||||||
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
|
|
||||||
Ok(buffer)
|
|
||||||
},
|
|
||||||
|new_values, db_values, buffer| {
|
|
||||||
let new_values = roaring_bitmap_from_u32s_array(new_values);
|
|
||||||
let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) {
|
|
||||||
Some(db_values) => new_values | db_values,
|
|
||||||
None => new_values, // should not happen
|
|
||||||
};
|
|
||||||
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
TypedChunk::Documents(obkv_documents_iter) => {
|
TypedChunk::Documents(obkv_documents_iter) => {
|
||||||
let mut cursor = obkv_documents_iter.into_cursor()?;
|
let mut cursor = obkv_documents_iter.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
Loading…
Reference in New Issue
Block a user