mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-25 14:10:06 +01:00
introduce exact_word_docids db
This commit is contained in:
parent
5f9f82757d
commit
0a77be4ec0
@ -59,6 +59,7 @@ pub mod main_key {
|
|||||||
pub mod db_name {
|
pub mod db_name {
|
||||||
pub const MAIN: &str = "main";
|
pub const MAIN: &str = "main";
|
||||||
pub const WORD_DOCIDS: &str = "word-docids";
|
pub const WORD_DOCIDS: &str = "word-docids";
|
||||||
|
pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
|
||||||
pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
|
pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
|
||||||
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
||||||
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
||||||
@ -83,6 +84,10 @@ pub struct Index {
|
|||||||
|
|
||||||
/// A word and all the documents ids containing the word.
|
/// A word and all the documents ids containing the word.
|
||||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
|
|
||||||
|
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
|
||||||
|
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
|
|
||||||
/// A prefix of word and all the documents ids containing this prefix.
|
/// A prefix of word and all the documents ids containing this prefix.
|
||||||
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
|
|
||||||
@ -119,12 +124,13 @@ impl Index {
|
|||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(14);
|
options.max_dbs(15);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
let main = env.create_poly_database(Some(MAIN))?;
|
let main = env.create_poly_database(Some(MAIN))?;
|
||||||
let word_docids = env.create_database(Some(WORD_DOCIDS))?;
|
let word_docids = env.create_database(Some(WORD_DOCIDS))?;
|
||||||
|
let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
|
||||||
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
|
let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
|
||||||
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
|
let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
|
||||||
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
@ -146,6 +152,7 @@ impl Index {
|
|||||||
env,
|
env,
|
||||||
main,
|
main,
|
||||||
word_docids,
|
word_docids,
|
||||||
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
|
@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
env: _env,
|
env: _env,
|
||||||
main: _main,
|
main: _main,
|
||||||
word_docids,
|
word_docids,
|
||||||
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
// Clear the other databases.
|
// Clear the other databases.
|
||||||
word_docids.clear(self.wtxn)?;
|
word_docids.clear(self.wtxn)?;
|
||||||
|
exact_word_docids.clear(self.wtxn)?;
|
||||||
word_prefix_docids.clear(self.wtxn)?;
|
word_prefix_docids.clear(self.wtxn)?;
|
||||||
docid_word_positions.clear(self.wtxn)?;
|
docid_word_positions.clear(self.wtxn)?;
|
||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
|
@ -2,7 +2,7 @@ use std::collections::btree_map::Entry;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::{ByteSlice, Str};
|
||||||
use heed::{BytesDecode, BytesEncode};
|
use heed::{BytesDecode, BytesEncode};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@ -16,7 +16,10 @@ use crate::heed_codec::facet::{
|
|||||||
};
|
};
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
use crate::index::{db_name, main_key};
|
use crate::index::{db_name, main_key};
|
||||||
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
|
use crate::{
|
||||||
|
DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32,
|
||||||
|
BEU32,
|
||||||
|
};
|
||||||
|
|
||||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
env: _env,
|
env: _env,
|
||||||
main: _main,
|
main: _main,
|
||||||
word_docids,
|
word_docids,
|
||||||
|
exact_word_docids,
|
||||||
word_prefix_docids,
|
word_prefix_docids,
|
||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
// We iterate over the words and delete the documents ids
|
// We iterate over the words and delete the documents ids
|
||||||
// from the word docids database.
|
// from the word docids database.
|
||||||
for (word, must_remove) in &mut words {
|
for (word, must_remove) in &mut words {
|
||||||
// We create an iterator to be able to get the content and delete the word docids.
|
remove_from_word_docids(
|
||||||
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing
|
self.wtxn,
|
||||||
// the LMDB B-Tree two times but only once.
|
word_docids,
|
||||||
let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
|
word.as_str(),
|
||||||
if let Some((key, mut docids)) = iter.next().transpose()? {
|
must_remove,
|
||||||
if key == word.as_str() {
|
&self.documents_ids,
|
||||||
let previous_len = docids.len();
|
)?;
|
||||||
docids -= &self.documents_ids;
|
|
||||||
if docids.is_empty() {
|
remove_from_word_docids(
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
self.wtxn,
|
||||||
unsafe { iter.del_current()? };
|
exact_word_docids,
|
||||||
*must_remove = true;
|
word.as_str(),
|
||||||
} else if docids.len() != previous_len {
|
must_remove,
|
||||||
let key = key.to_owned();
|
&self.documents_ids,
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
)?;
|
||||||
unsafe { iter.put_current(&key, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// We construct an FST set that contains the words to delete from the words FST.
|
// We construct an FST set that contains the words to delete from the words FST.
|
||||||
@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Subtracts `to_remove` from the document ids stored under exactly `word` in `db`.
///
/// Only the first entry yielded by the prefix iterator is inspected, and only when
/// its key is equal to `word`:
/// - if the resulting bitmap is empty, the entry is deleted and `*must_remove` is set to `true`;
/// - if the bitmap shrank but is not empty, the entry is overwritten with the new bitmap;
/// - if nothing changed, or no entry matches `word`, the database is left untouched.
///
/// `must_remove` is only ever set to `true` here, never reset to `false`.
/// NOTE(review): a caller that reuses the same flag across several databases will
/// observe `true` as soon as one of them drops the word, even if another one still
/// holds it — confirm that this is the intended meaning of the flag.
fn remove_from_word_docids(
|
||||||
|
txn: &mut heed::RwTxn,
|
||||||
|
db: &heed::Database<Str, RoaringBitmapCodec>,
|
||||||
|
word: &str,
|
||||||
|
must_remove: &mut bool,
|
||||||
|
to_remove: &RoaringBitmap,
|
||||||
|
) -> Result<()> {
|
||||||
|
// We create an iterator to be able to get the content and delete the word docids.
|
||||||
|
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing
|
||||||
|
// the LMDB B-Tree two times but only once.
|
||||||
|
let mut iter = db.prefix_iter_mut(txn, &word)?;
|
||||||
|
if let Some((key, mut docids)) = iter.next().transpose()? {
|
||||||
|
// The first key sharing the prefix may be a longer word: only touch an exact match.
if key == word {
|
||||||
|
// Remember the size before the subtraction to detect whether anything was removed.
let previous_len = docids.len();
|
||||||
|
docids -= to_remove;
|
||||||
|
if docids.is_empty() {
|
||||||
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
|
unsafe { iter.del_current()? };
|
||||||
|
*must_remove = true;
|
||||||
|
} else if docids.len() != previous_len {
|
||||||
|
// Copy the key out of the database before writing through the cursor.
let key = key.to_owned();
|
||||||
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
|
unsafe { iter.put_current(&key, &docids)? };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
|
fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
|
||||||
wtxn: &'a mut heed::RwTxn,
|
wtxn: &'a mut heed::RwTxn,
|
||||||
db: &heed::Database<C, DC>,
|
db: &heed::Database<C, DC>,
|
||||||
|
@ -10,17 +10,21 @@ use super::helpers::{
|
|||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
|
use crate::update::index_documents::MergeFn;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
/// Extracts the word and the documents ids where this word appear.
|
/// Extracts the word and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
/// Returns a grenad reader with the list of extracted words and
|
/// Returns a grenad reader with the list of extracted words and
|
||||||
/// documents ids from the given chunk of docid word positions.
|
/// documents ids from the given chunk of docid word positions.
|
||||||
|
///
|
||||||
|
/// The first returned reader is the one for normal word_docids, and the second one is for
|
||||||
|
/// exact_word_docids
|
||||||
#[logging_timer::time]
|
#[logging_timer::time]
|
||||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_docids_sorter = create_sorter(
|
let mut word_docids_sorter = create_sorter(
|
||||||
@ -43,5 +47,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(word_docids_sorter, indexer)
|
let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn);
|
||||||
|
Ok((
|
||||||
|
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||||
|
sorter_into_reader(empty_sorter, indexer)?,
|
||||||
|
))
|
||||||
}
|
}
|
||||||
|
@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-wordcount-docids",
|
"field-id-wordcount-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_word_docids,
|
extract_word_docids,
|
||||||
merge_roaring_bitmaps,
|
merge_roaring_bitmaps,
|
||||||
TypedChunk::WordDocids,
|
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
},
|
||||||
"word-docids",
|
"word-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database(
|
|||||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Used when trying to merge readers, but you don't actually care about the values.
///
/// Both the key and the conflicting values are ignored: the function always
/// succeeds and returns an empty, owned byte buffer as the merged value.
|
||||||
|
pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||||
|
Ok(Cow::Owned(Vec::new()))
|
||||||
|
}
|
||||||
|
@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto};
|
|||||||
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
pub use grenad_helpers::{
|
pub use grenad_helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing,
|
||||||
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
|
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
|
||||||
GrenadParameters, MergeableReader,
|
GrenadParameters, MergeableReader,
|
||||||
};
|
};
|
||||||
|
@ -20,7 +20,7 @@ pub use self::helpers::{
|
|||||||
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||||
sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
|
sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
|
||||||
};
|
};
|
||||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters};
|
||||||
pub use self::transform::{Transform, TransformOutput};
|
pub use self::transform::{Transform, TransformOutput};
|
||||||
use crate::documents::DocumentBatchReader;
|
use crate::documents::DocumentBatchReader;
|
||||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||||
@ -282,6 +282,7 @@ where
|
|||||||
let mut word_pair_proximity_docids = None;
|
let mut word_pair_proximity_docids = None;
|
||||||
let mut word_position_docids = None;
|
let mut word_position_docids = None;
|
||||||
let mut word_docids = None;
|
let mut word_docids = None;
|
||||||
|
let mut _exact_word_docids = None;
|
||||||
|
|
||||||
let mut databases_seen = 0;
|
let mut databases_seen = 0;
|
||||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
@ -291,10 +292,13 @@ where
|
|||||||
|
|
||||||
for result in lmdb_writer_rx {
|
for result in lmdb_writer_rx {
|
||||||
let typed_chunk = match result? {
|
let typed_chunk = match result? {
|
||||||
TypedChunk::WordDocids(chunk) => {
|
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||||
word_docids = Some(cloneable_chunk);
|
word_docids = Some(cloneable_chunk);
|
||||||
TypedChunk::WordDocids(chunk)
|
let cloneable_chunk =
|
||||||
|
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||||
|
_exact_word_docids = Some(cloneable_chunk);
|
||||||
|
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
|
||||||
}
|
}
|
||||||
TypedChunk::WordPairProximityDocids(chunk) => {
|
TypedChunk::WordPairProximityDocids(chunk) => {
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||||
@ -425,6 +429,10 @@ where
|
|||||||
});
|
});
|
||||||
|
|
||||||
if let Some(word_docids) = word_docids {
|
if let Some(word_docids) = word_docids {
|
||||||
|
let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn);
|
||||||
|
word_docids_builder.push(word_docids.into_cursor()?);
|
||||||
|
// TODO: push exact_word_docids
|
||||||
|
let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?;
|
||||||
// Run the word prefix docids update operation.
|
// Run the word prefix docids update operation.
|
||||||
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
|
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
|
||||||
builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
|
builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
|
||||||
@ -432,7 +440,7 @@ where
|
|||||||
builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
|
builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
|
||||||
builder.max_memory = self.indexer_config.max_memory;
|
builder.max_memory = self.indexer_config.max_memory;
|
||||||
builder.execute(
|
builder.execute(
|
||||||
word_docids,
|
word_docids_iter,
|
||||||
&new_prefix_fst_words,
|
&new_prefix_fst_words,
|
||||||
&common_prefix_fst_words,
|
&common_prefix_fst_words,
|
||||||
&del_prefix_fst_words,
|
&del_prefix_fst_words,
|
||||||
|
@ -3,14 +3,16 @@ use std::convert::TryInto;
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
use grenad::MergerBuilder;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::{BytesDecode, RwTxn};
|
use heed::{BytesDecode, RwTxn};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
|
self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
|
||||||
CursorClonableMmap,
|
CursorClonableMmap,
|
||||||
};
|
};
|
||||||
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
||||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
use crate::update::index_documents::helpers::as_cloneable_grenad;
|
||||||
use crate::{
|
use crate::{
|
||||||
@ -25,7 +27,10 @@ pub(crate) enum TypedChunk {
|
|||||||
Documents(grenad::Reader<CursorClonableMmap>),
|
Documents(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdWordcountDocids(grenad::Reader<File>),
|
FieldIdWordcountDocids(grenad::Reader<File>),
|
||||||
NewDocumentsIds(RoaringBitmap),
|
NewDocumentsIds(RoaringBitmap),
|
||||||
WordDocids(grenad::Reader<File>),
|
WordDocids {
|
||||||
|
word_docids_reader: grenad::Reader<File>,
|
||||||
|
exact_word_docids_reader: grenad::Reader<File>,
|
||||||
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<File>),
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||||
@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
TypedChunk::NewDocumentsIds(documents_ids) => {
|
TypedChunk::NewDocumentsIds(documents_ids) => {
|
||||||
return Ok((documents_ids, is_merged_database))
|
return Ok((documents_ids, is_merged_database))
|
||||||
}
|
}
|
||||||
TypedChunk::WordDocids(word_docids_iter) => {
|
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
||||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?;
|
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
word_docids_iter.clone(),
|
word_docids_iter.clone(),
|
||||||
&index.word_docids,
|
&index.word_docids,
|
||||||
@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
merge_roaring_bitmaps,
|
merge_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||||
|
append_entries_into_database(
|
||||||
|
exact_word_docids_iter.clone(),
|
||||||
|
&index.exact_word_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
|value, _buffer| Ok(value),
|
||||||
|
merge_roaring_bitmaps,
|
||||||
|
)?;
|
||||||
|
|
||||||
// create fst from word docids
|
// create fst from word docids
|
||||||
let mut builder = fst::SetBuilder::memory();
|
let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
|
||||||
let mut cursor = word_docids_iter.into_cursor()?;
|
|
||||||
while let Some((word, _value)) = cursor.move_on_next()? {
|
|
||||||
// This is a lexicographically ordered word position
|
|
||||||
// we use the key to construct the words fst.
|
|
||||||
builder.insert(word)?;
|
|
||||||
}
|
|
||||||
let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
|
|
||||||
let db_fst = index.words_fst(wtxn)?;
|
let db_fst = index.words_fst(wtxn)?;
|
||||||
|
|
||||||
// merge new fst with database fst
|
// merge new fst with database fst
|
||||||
@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
Ok((RoaringBitmap::new(), is_merged_database))
|
Ok((RoaringBitmap::new(), is_merged_database))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds an in-memory FST set from the keys of the two given readers.
///
/// Both readers are pushed into a single merger configured with `merge_nothing`,
/// and every key yielded by the merged stream is inserted into the set builder;
/// the associated values are ignored.
///
/// NOTE(review): presumably the merger yields keys in ascending order, with keys
/// present in both readers emitted once, which the FST builder needs — confirm
/// against the grenad and fst documentation.
fn merge_word_docids_reader_into_fst(
|
||||||
|
word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||||
|
exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||||
|
) -> Result<fst::Set<Vec<u8>>> {
|
||||||
|
// The merged values are never read, so the merge function can discard them.
let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn);
|
||||||
|
merger_builder.push(word_docids_iter.into_cursor()?);
|
||||||
|
merger_builder.push(exact_word_docids_iter.into_cursor()?);
|
||||||
|
let mut iter = merger_builder.build().into_stream_merger_iter()?;
|
||||||
|
let mut builder = fst::SetBuilder::memory();
|
||||||
|
|
||||||
|
// Only the keys (the words) go into the set.
while let Some((k, _)) = iter.next()? {
|
||||||
|
builder.insert(k)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(builder.into_set())
|
||||||
|
}
|
||||||
|
|
||||||
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
||||||
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
||||||
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
||||||
|
@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
#[logging_timer::time("WordPrefixDocids::{}")]
|
#[logging_timer::time("WordPrefixDocids::{}")]
|
||||||
pub fn execute(
|
pub fn execute(
|
||||||
self,
|
self,
|
||||||
new_word_docids: grenad::Reader<CursorClonableMmap>,
|
mut new_word_docids_iter: grenad::MergerIter<CursorClonableMmap, MergeFn>,
|
||||||
new_prefix_fst_words: &[String],
|
new_prefix_fst_words: &[String],
|
||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
);
|
);
|
||||||
|
|
||||||
if !common_prefix_fst_words.is_empty() {
|
if !common_prefix_fst_words.is_empty() {
|
||||||
let mut new_word_docids_iter = new_word_docids.into_cursor()?;
|
|
||||||
let mut current_prefixes: Option<&&[String]> = None;
|
let mut current_prefixes: Option<&&[String]> = None;
|
||||||
let mut prefixes_cache = HashMap::new();
|
let mut prefixes_cache = HashMap::new();
|
||||||
while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
|
while let Some((word, data)) = new_word_docids_iter.next()? {
|
||||||
current_prefixes = match current_prefixes.take() {
|
current_prefixes = match current_prefixes.take() {
|
||||||
Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes),
|
Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes),
|
||||||
_otherwise => {
|
_otherwise => {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user