mirror of https://github.com/meilisearch/MeiliSearch
synced 2024-11-09 22:48:54 +01:00
remove milli/src/update/new/extract/extract_word_docids.rs
This commit is contained in:
parent 6a399556b5
commit 781a186f75
@@ -1,168 +0,0 @@
use std::fs::File;

use charabia::TokenizerBuilder;
use grenad::Merger;
use heed::RoTxn;
use rayon::iter::{IntoParallelIterator, ParallelIterator};

use super::cache::CachedSorter;
use super::tokenize_document::DocumentTokenizer;
use crate::update::new::{DocumentChange, ItemsPool};
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};

pub trait SearchableExtractor {
    fn run_extraction(
        index: &Index,
        fields_ids_map: &GlobalFieldsIdsMap,
        indexer: GrenadParameters,
        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
    ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
        let max_memory = indexer.max_memory_by_thread();

        let rtxn = index.read_txn()?;
        let stop_words = index.stop_words(&rtxn)?;
        let allowed_separators = index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.into_tokenizer();

        let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn)?;
        let localized_attributes_rules =
            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();

        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            searchable_attributes: user_defined_searchable_fields.as_deref(),
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };

        let context_pool = ItemsPool::new(|| {
            Ok((
                index.read_txn()?,
                &document_tokenizer,
                fields_ids_map.clone(),
                CachedSorter::new(
                    // TODO use a better value
                    100.try_into().unwrap(),
                    create_sorter(
                        grenad::SortAlgorithm::Stable,
                        MergeDeladdCboRoaringBitmaps,
                        indexer.chunk_compression_type,
                        indexer.chunk_compression_level,
                        indexer.max_nb_chunks,
                        max_memory,
                    ),
                ),
            ))
        });

        document_changes.into_par_iter().try_for_each(|document_change| {
            context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
                Self::extract_document_change(
                    &*rtxn,
                    index,
                    document_tokenizer,
                    fields_ids_map,
                    cached_sorter,
                    document_change?,
                )
            })
        })?;

        let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
        for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() {
            let sorter = cache.into_sorter()?;
            let readers = sorter.into_reader_cursors()?;
            builder.extend(readers);
        }

        Ok(builder.build())
    }

    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> Result<()>;
}

pub struct WordDocidsExtractor;
impl SearchableExtractor for WordDocidsExtractor {
    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> crate::Result<()> {
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut token_fn = |_fid, _pos: u16, word: &str| {
                    cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap();
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
            }
            DocumentChange::Update(inner) => {
                let mut token_fn = |_fid, _pos, word: &str| {
                    cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap();
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;

                let mut token_fn = |_fid, _pos, word: &str| {
                    cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap();
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
            }
            DocumentChange::Insertion(inner) => {
                let mut token_fn = |_fid, _pos, word: &str| {
                    cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap();
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
            }
        }

        Ok(())
    }
}

/// Factorize tokenizer building.
fn tokenizer_builder<'a>(
    stop_words: Option<&'a fst::Set<&'a [u8]>>,
    allowed_separators: Option<&'a [&str]>,
    dictionary: Option<&'a [&str]>,
) -> TokenizerBuilder<'a, &'a [u8]> {
    let mut tokenizer_builder = TokenizerBuilder::new();
    if let Some(stop_words) = stop_words {
        tokenizer_builder.stop_words(stop_words);
    }
    if let Some(dictionary) = dictionary {
        tokenizer_builder.words_dict(dictionary);
    }
    if let Some(separators) = allowed_separators {
        tokenizer_builder.separators(separators);
    }

    tokenizer_builder
}
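For context, the trait does all of its work in the provided `run_extraction` method; implementors such as `WordDocidsExtractor` only supply `extract_document_change`. The sketch below is a hypothetical call site, not taken from this commit: the function name is illustrative, it is assumed to live in the same module so the imports above apply, and the index, fields-ids map, grenad parameters, and parallel stream of document changes are assumed to come from the surrounding indexing pipeline.

// Hypothetical call site (not part of this commit): only `run_extraction` and
// the types in its signature come from the file above.
fn extract_word_docids_sketch(
    index: &Index,
    fields_ids_map: &GlobalFieldsIdsMap,
    indexer_params: GrenadParameters,
    document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
    // `run_extraction` builds one tokenizer, fans the document changes out over
    // rayon, lets each worker fill its own `CachedSorter` with del/add word
    // docids, and merges every per-thread sorter into a single `Merger`.
    WordDocidsExtractor::run_extraction(index, fields_ids_map, indexer_params, document_changes)
}

Each worker draws its own read transaction and `CachedSorter` from the `ItemsPool`, so no synchronization is needed on the hot path; the per-thread sorters are only combined once at the end.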