mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-05 19:23:31 +01:00
Generalize usage of CboRoaringBitmap codec to ease the use
This commit is contained in:
parent
8fb96b8274
commit
9078e60024
@ -119,16 +119,16 @@ pub struct Index {
|
||||
pub(crate) main: PolyDatabase,
|
||||
|
||||
/// A word and all the documents ids containing the word.
|
||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
|
||||
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// A prefix of word and all the documents ids containing this prefix.
|
||||
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
||||
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
|
@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
|
@ -495,7 +495,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
|
||||
fn remove_from_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
db: &Database<Str, RoaringBitmapCodec>,
|
||||
db: &Database<Str, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
) -> Result<fst::Set<Vec<u8>>> {
|
||||
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
||||
@ -523,7 +523,7 @@ fn remove_from_word_prefix_docids(
|
||||
|
||||
fn remove_from_word_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
db: &heed::Database<Str, RoaringBitmapCodec>,
|
||||
db: &heed::Database<Str, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
words_to_keep: &mut BTreeSet<String>,
|
||||
words_to_remove: &mut BTreeSet<String>,
|
||||
|
@ -106,6 +106,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
tokenizer_builder.allow_list(&script_language);
|
||||
let tokenizer = tokenizer_builder.build();
|
||||
|
||||
|
@ -8,7 +8,7 @@ use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap,
|
||||
create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap,
|
||||
sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
@ -36,15 +36,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
|
||||
let mut current_document_id = None;
|
||||
let mut fid = 0;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut words = BTreeSet::new();
|
||||
@ -55,28 +52,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
fid = u16::from_be_bytes(fid_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
// drain the btreemaps when we change document.
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
&mut key_buffer,
|
||||
&mut value_buffer,
|
||||
&mut words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
for (_pos, word) in KvReaderU16::new(&value).iter() {
|
||||
words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// We must make sure that we don't lose the current document field id
|
||||
if let Some(document_id) = current_document_id {
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
@ -85,11 +66,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
&mut words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
|
||||
words.clear();
|
||||
}
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@ -98,7 +81,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@ -142,15 +125,13 @@ fn words_into_sorter(
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
let bitmap = RoaringBitmap::from_iter(Some(document_id));
|
||||
serialize_roaring_bitmap(&bitmap, value_buffer)?;
|
||||
|
||||
for word_bytes in words.iter() {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?;
|
||||
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
}
|
||||
|
||||
words.clear();
|
||||
|
@ -56,7 +56,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
current_document_id.unwrap(),
|
||||
&word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorter,
|
||||
)?;
|
||||
@ -64,6 +64,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
word_positions.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
for (position, word) in KvReaderU16::new(&value).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while word_positions.get(0).map_or(false, |(_w, p)| {
|
||||
|
@ -35,7 +35,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
);
|
||||
|
||||
let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new();
|
||||
let mut current_document_id = None;
|
||||
let mut current_document_id: Option<u32> = None;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
@ -49,7 +49,8 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
word_position_docids_sorter
|
||||
.insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?;
|
||||
}
|
||||
word_positions.clear();
|
||||
}
|
||||
|
@ -181,7 +181,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
|
@ -38,7 +38,7 @@ use crate::update::{
|
||||
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
||||
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||
};
|
||||
use crate::{Index, Result, RoaringBitmapCodec};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
|
||||
|
||||
static MERGED_DATABASE_COUNT: usize = 7;
|
||||
static PREFIX_DATABASE_COUNT: usize = 5;
|
||||
@ -700,8 +700,8 @@ where
|
||||
fn execute_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
||||
word_docids_db: Database<Str, RoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
|
||||
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
indexer_config: &IndexerConfig,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
|
@ -156,7 +156,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
@ -166,7 +166,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
||||
|
@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str};
|
||||
use heed::Database;
|
||||
|
||||
use crate::update::index_documents::{
|
||||
create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
||||
CursorClonableMmap, MergeFn,
|
||||
};
|
||||
use crate::{Result, RoaringBitmapCodec};
|
||||
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
|
||||
|
||||
pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
) -> WordPrefixDocids<'t, 'u, 'i> {
|
||||
WordPrefixDocids {
|
||||
wtxn,
|
||||
@ -51,7 +51,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
// and write into it at the same time, therefore we write into another file.
|
||||
let mut prefix_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.max_nb_chunks,
|
||||
@ -115,7 +115,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.word_prefix_docids.as_polymorph(),
|
||||
prefix_docids_sorter,
|
||||
merge_roaring_bitmaps,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
|
Loading…
x
Reference in New Issue
Block a user