Use the GlobalFieldsIdsMap everywhere and write it to disk
Co-authored-by: Dureuill <louis@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
parent c50d3edc4a
commit c1557734dc
11 changed files with 70 additions and 99 deletions
@@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
 use crate::FieldId;
 
 mod global;
+pub use global::GlobalFieldsIdsMap;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct FieldsIdsMap {
@@ -4,11 +4,13 @@ use std::sync::RwLock;
 use crate::{FieldId, FieldsIdsMap};
 
 /// A fields ids map that can be globally updated to add fields
+#[derive(Debug, Clone)]
 pub struct GlobalFieldsIdsMap<'indexing> {
     global: &'indexing RwLock<FieldsIdsMap>,
     local: LocalFieldsIdsMap,
 }
 
+#[derive(Debug, Clone)]
 struct LocalFieldsIdsMap {
     names_ids: BTreeMap<String, FieldId>,
     ids_names: BTreeMap<FieldId, String>,
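Note: the two-level layout above is what makes this type cheap to share: lookups hit the thread-local LocalFieldsIdsMap first and only touch the shared RwLock on a miss. The method bodies are not part of this diff, so the following is a hedged, self-contained sketch of that resolution pattern with simplified stand-in types (one-directional local cache, toy insert), not the crate's actual global.rs:

    use std::collections::BTreeMap;
    use std::sync::RwLock;

    type FieldId = u16;

    struct FieldsIdsMap {
        names_ids: BTreeMap<String, FieldId>,
        next_id: Option<FieldId>,
    }

    impl FieldsIdsMap {
        fn new() -> Self {
            Self { names_ids: BTreeMap::new(), next_id: Some(0) }
        }

        fn id(&self, name: &str) -> Option<FieldId> {
            self.names_ids.get(name).copied()
        }

        /// Returns the existing id or allocates one; `None` once the
        /// u16 id space is exhausted (hence the Option at call sites).
        fn insert(&mut self, name: &str) -> Option<FieldId> {
            if let Some(id) = self.id(name) {
                return Some(id);
            }
            let id = self.next_id?;
            self.next_id = id.checked_add(1);
            self.names_ids.insert(name.to_owned(), id);
            Some(id)
        }
    }

    struct GlobalFieldsIdsMap<'indexing> {
        global: &'indexing RwLock<FieldsIdsMap>,
        local: BTreeMap<String, FieldId>, // stand-in for LocalFieldsIdsMap
    }

    impl<'indexing> GlobalFieldsIdsMap<'indexing> {
        fn new(global: &'indexing RwLock<FieldsIdsMap>) -> Self {
            Self { global, local: BTreeMap::new() }
        }

        fn id_or_insert(&mut self, name: &str) -> Option<FieldId> {
            // 1. Local cache: no synchronization at all.
            if let Some(&id) = self.local.get(name) {
                return Some(id);
            }
            // 2. Shared read lock: another thread may have added the field.
            if let Some(id) = self.global.read().unwrap().id(name) {
                self.local.insert(name.to_owned(), id);
                return Some(id);
            }
            // 3. Write lock: allocate the id for everyone (insert re-checks).
            let id = self.global.write().unwrap().insert(name)?;
            self.local.insert(name.to_owned(), id);
            Some(id)
        }
    }

    fn main() {
        let global = RwLock::new(FieldsIdsMap::new());
        let mut view = GlobalFieldsIdsMap::new(&global);
        assert_eq!(view.id_or_insert("title"), Some(0));
        assert_eq!(view.id_or_insert("title"), Some(0)); // served from the local cache
    }

The Option return models id-space exhaustion: FieldId is a u16, so allocation can legitimately fail, which is what the id_or_insert call sites later in this diff have to handle.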
@@ -55,7 +55,7 @@ pub use self::error::{
 };
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fieldids_weights_map::FieldidsWeightsMap;
-pub use self::fields_ids_map::FieldsIdsMap;
+pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
 pub use self::heed_codec::{
     BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
@@ -1,32 +1,20 @@
 use std::fs::File;
 
 use charabia::TokenizerBuilder;
-use grenad::Merger;
-use grenad::ReaderCursor;
+use grenad::{Merger, ReaderCursor};
 use heed::RoTxn;
-use rayon::iter::IntoParallelIterator;
-use rayon::iter::ParallelBridge;
-use rayon::iter::ParallelIterator;
+use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
 
-use crate::update::MergeDeladdCboRoaringBitmaps;
-use crate::{
-    update::{
-        create_sorter,
-        new::{DocumentChange, ItemsPool},
-        GrenadParameters,
-    },
-    FieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE,
-};
-
-use super::{
-    cache::{CachedSorter, DelAddRoaringBitmapMerger},
-    tokenize_document::DocumentTokenizer,
-};
+use super::cache::CachedSorter;
+use super::tokenize_document::DocumentTokenizer;
+use crate::update::new::{DocumentChange, ItemsPool};
+use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
 
 pub trait SearchableExtractor {
     fn run_extraction(
         index: &Index,
-        fields_ids_map: &FieldsIdsMap,
+        fields_ids_map: &GlobalFieldsIdsMap,
         indexer: GrenadParameters,
         document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
     ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
@@ -62,12 +50,13 @@ pub trait SearchableExtractor {
             Ok((
                 index.read_txn()?,
                 &document_tokenizer,
+                fields_ids_map.clone(),
                 CachedSorter::new(
                     // TODO use a better value
                     100.try_into().unwrap(),
                     create_sorter(
                         grenad::SortAlgorithm::Stable,
-                        DelAddRoaringBitmapMerger,
+                        MergeDeladdCboRoaringBitmaps,
                         indexer.chunk_compression_type,
                         indexer.chunk_compression_level,
                         indexer.max_nb_chunks,
@@ -78,12 +67,12 @@ pub trait SearchableExtractor {
         });
 
         document_changes.into_par_iter().try_for_each(|document_change| {
-            context_pool.with(|(rtxn, document_tokenizer, cached_sorter)| {
+            context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
                 Self::extract_document_change(
                     &*rtxn,
                     index,
                     document_tokenizer,
-                    &fields_ids_map,
+                    fields_ids_map,
                     cached_sorter,
                     document_change?,
                 )
@@ -91,7 +80,7 @@
         })?;
 
         let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
-        for (_rtxn, _tokenizer, cache) in context_pool.into_items() {
+        for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() {
             let sorter = cache.into_sorter()?;
             let readers = sorter.into_reader_cursors()?;
             builder.extend(readers);
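Note on the two hunks above: the fields_ids_map.clone() goes into the per-thread context tuple, so each rayon worker fills its own local cache while sharing the same global lock, and the clones are simply dropped when the pool is drained (only the cache is kept). A minimal, self-contained sketch of such a checkout pool — a simplified stand-in for the crate's ItemsPool, not its actual implementation:

    use std::sync::Mutex;

    /// Threads check an item out, reuse it for one unit of work, and put it
    /// back; an item is created lazily whenever the pool runs dry.
    struct ItemsPool<T, F: Fn() -> T> {
        init: F,
        items: Mutex<Vec<T>>,
    }

    impl<T, F: Fn() -> T> ItemsPool<T, F> {
        fn new(init: F) -> Self {
            Self { init, items: Mutex::new(Vec::new()) }
        }

        fn with<R>(&self, f: impl FnOnce(&mut T) -> R) -> R {
            let popped = self.items.lock().unwrap().pop();
            let mut item = popped.unwrap_or_else(&self.init);
            let result = f(&mut item);
            self.items.lock().unwrap().push(item);
            result
        }

        /// After the parallel work, recover every per-thread item (the real
        /// code does this to drain each thread's CachedSorter into the merger).
        fn into_items(self) -> Vec<T> {
            self.items.into_inner().unwrap()
        }
    }

    fn main() {
        let pool = ItemsPool::new(|| Vec::<u32>::new());
        for n in 0..4 {
            pool.with(|buf| buf.push(n)); // in the crate this runs on rayon workers
        }
        let items: Vec<_> = pool.into_items();
        assert_eq!(items.iter().map(Vec::len).sum::<usize>(), 4);
    }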
@@ -104,8 +93,8 @@
         rtxn: &RoTxn,
         index: &Index,
         document_tokenizer: &DocumentTokenizer,
-        fields_ids_map: &FieldsIdsMap,
-        cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut CachedSorter<MergeDeladdCboRoaringBitmaps>,
         document_change: DocumentChange,
     ) -> Result<()>;
 }
@@ -116,9 +105,8 @@ impl SearchableExtractor for WordDocidsExtractor {
         rtxn: &RoTxn,
         index: &Index,
         document_tokenizer: &DocumentTokenizer,
-        fields_ids_map: &FieldsIdsMap,
-        // TODO: DelAddRoaringBitmapMerger should be CBO
-        cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut CachedSorter<MergeDeladdCboRoaringBitmaps>,
         document_change: DocumentChange,
     ) -> crate::Result<()> {
         match document_change {
@@ -1,11 +1,14 @@
-use crate::{
-    update::new::KvReaderFieldId, FieldId, FieldsIdsMap, Index, InternalError,
-    LocalizedAttributesRule, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
-};
 use std::collections::HashMap;
 
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use heed::RoTxn;
 use serde_json::Value;
 
+use crate::update::new::KvReaderFieldId;
+use crate::{
+    FieldId, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule,
+    Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
+};
+
 pub struct DocumentTokenizer<'a> {
     pub tokenizer: &'a Tokenizer<'a>,
@@ -18,18 +21,24 @@ impl<'a> DocumentTokenizer<'a> {
     pub fn tokenize_document(
         &self,
         obkv: &KvReaderFieldId,
-        field_id_map: &FieldsIdsMap,
+        field_id_map: &mut GlobalFieldsIdsMap,
         token_fn: &mut impl FnMut(FieldId, u16, &str),
     ) -> Result<()> {
         let mut field_position = HashMap::new();
+        let mut field_name = String::new();
         for (field_id, field_bytes) in obkv {
-            let Some(field_name) = field_id_map.name(field_id) else {
+            let Some(field_name) = field_id_map.name(field_id).map(|s| {
+                field_name.clear();
+                field_name.push_str(s);
+                &field_name
+            }) else {
                 unreachable!("field id not found in field id map");
             };
 
             let mut tokenize_field = |name: &str, value: &Value| {
-                let Some(field_id) = field_id_map.id(name) else {
-                    unreachable!("field name not found in field id map");
+                let Some(field_id) = field_id_map.id_or_insert(name) else {
+                    /// TODO: better error
+                    panic!("it's over 9000");
                 };
 
                 let position =
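Note: the field_name.clear() / push_str dance above is not cosmetic. With a plain FieldsIdsMap, name(field_id) could hand out a &str borrowed from the map for the whole iteration, but tokenize_field now needs field_id_map mutably (id_or_insert may register nested field names). Copying the name into a reusable buffer ends the borrow immediately. A self-contained sketch of the conflict and the fix, with a toy map standing in for GlobalFieldsIdsMap:

    use std::collections::HashMap;

    struct ToyMap {
        ids: HashMap<String, u16>,
        names: HashMap<u16, String>,
    }

    impl ToyMap {
        fn name(&self, id: u16) -> Option<&str> {
            self.names.get(&id).map(String::as_str)
        }

        fn id_or_insert(&mut self, name: &str) -> Option<u16> {
            let next = self.ids.len() as u16;
            Some(*self.ids.entry(name.to_owned()).or_insert(next))
        }
    }

    fn walk(map: &mut ToyMap) {
        let mut field_name = String::new();
        for field_id in 0..2u16 {
            // Copy the name out; the shared borrow of `map` ends at the `;`.
            let Some(()) = map.name(field_id).map(|s| {
                field_name.clear();
                field_name.push_str(s);
            }) else {
                continue;
            };
            // `map` is free to be borrowed mutably again, e.g. to register a
            // nested field discovered while tokenizing this one.
            let _nested = map.id_or_insert(&format!("{field_name}.sub"));
        }
    }

    fn main() {
        let mut map = ToyMap {
            ids: HashMap::from([("doc".to_owned(), 0)]),
            names: HashMap::from([(0, "doc".to_owned())]),
        };
        walk(&mut map);
        assert_eq!(map.ids.len(), 2); // "doc" and "doc.sub"
    }

It also shows why id_or_insert returns an Option here: the u16 field-id space can run out, and the panic!("it's over 9000") above is an acknowledged placeholder until a real user-facing error exists.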
@@ -75,7 +84,7 @@ impl<'a> DocumentTokenizer<'a> {
 
         // if the current field is searchable or contains a searchable attribute
         if self.searchable_attributes.map_or(true, |attributes| {
-            attributes.iter().any(|name| perm_json_p::contained_in(name, field_name))
+            attributes.iter().any(|name| perm_json_p::contained_in(name, &field_name))
         }) {
             // parse json.
             match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
@@ -224,11 +233,12 @@ mod perm_json_p {
 
 #[cfg(test)]
 mod test {
-    use super::*;
     use charabia::TokenizerBuilder;
     use meili_snap::snapshot;
     use obkv::KvReader;
     use serde_json::json;
 
+    use super::*;
+
     #[test]
     fn test_tokenize_document() {
         let mut fields_ids_map = FieldsIdsMap::new();
@@ -27,6 +27,7 @@ impl<'p> DocumentChanges<'p> for DocumentDeletion {
 
     fn document_changes(
         self,
+        _fields_ids_map: &mut FieldsIdsMap,
         param: Self::Parameter,
     ) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
         let (index, fields, primary_key) = param;
@@ -73,13 +73,14 @@ impl DocumentOperation {
 }
 
 impl<'p> DocumentChanges<'p> for DocumentOperation {
-    type Parameter = (&'p Index, &'p RoTxn<'p>, &'p mut FieldsIdsMap, &'p PrimaryKey<'p>);
+    type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>);
 
     fn document_changes(
         self,
+        fields_ids_map: &mut FieldsIdsMap,
         param: Self::Parameter,
     ) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
-        let (index, rtxn, fields_ids_map, primary_key) = param;
+        let (index, rtxn, primary_key) = param;
 
         let documents_ids = index.documents_ids(rtxn)?;
         let mut available_docids = AvailableIds::new(&documents_ids);
@@ -174,7 +175,7 @@ impl<'p> DocumentChanges<'p> for DocumentOperation {
 
         /// TODO is it the best way to provide FieldsIdsMap to the parallel iterator?
         let fields_ids_map = fields_ids_map.clone();
-        // We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
+        // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
         let docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect();
 
         Ok(docids_version_offsets
@@ -1,4 +1,5 @@
 use std::fs::File;
+use std::sync::RwLock;
 use std::thread::{self, Builder};
 
 use big_s::S;
@@ -22,7 +23,7 @@ use crate::documents::{
     obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY,
 };
 use crate::update::GrenadParameters;
-use crate::{Index, Result, UserError};
+use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
 
 mod document_deletion;
 mod document_operation;
@@ -34,6 +35,7 @@ pub trait DocumentChanges<'p> {
 
     fn document_changes(
         self,
+        fields_ids_map: &mut FieldsIdsMap,
         param: Self::Parameter,
     ) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
 }
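Note: with this trait change the map travels through document_changes itself rather than through each implementor's Parameter tuple (DocumentOperation drops the &'p mut FieldsIdsMap from its parameter above), and implementations that never add fields just bind _fields_ids_map. Sketched caller side, hedged — the trait is reduced to the relevant pieces, the parallel iterator is simplified to a Vec, and all types are stand-ins:

    type FieldsIdsMap = std::collections::BTreeMap<String, u16>; // stand-in

    trait DocumentChanges<'p> {
        type Parameter: 'p;

        fn document_changes(
            self,
            fields_ids_map: &mut FieldsIdsMap,
            param: Self::Parameter,
        ) -> Vec<String>; // the real trait returns a parallel iterator of changes
    }

    struct DocumentDeletion;

    impl<'p> DocumentChanges<'p> for DocumentDeletion {
        type Parameter = &'p [String];

        fn document_changes(
            self,
            _fields_ids_map: &mut FieldsIdsMap, // deletion never adds fields
            param: Self::Parameter,
        ) -> Vec<String> {
            param.to_vec()
        }
    }

    fn main() {
        let mut map = FieldsIdsMap::new();
        let docs = vec!["1".to_owned(), "2".to_owned()];
        // The caller owns the map and threads it into whichever operation runs.
        let changes = DocumentDeletion.document_changes(&mut map, &docs);
        assert_eq!(changes.len(), 2);
    }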
@@ -46,6 +48,7 @@ pub trait DocumentChanges<'p> {
 pub fn index<PI>(
     wtxn: &mut RwTxn,
     index: &Index,
+    fields_ids_map: FieldsIdsMap,
     pool: &ThreadPool,
     document_changes: PI,
 ) -> Result<()>
@@ -57,6 +60,9 @@ where
     let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } =
         extractors_merger_channels(100);
 
+    let fields_ids_map_lock = RwLock::new(fields_ids_map);
+    let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
+
     thread::scope(|s| {
         // TODO manage the errors correctly
         let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || {
@@ -65,7 +71,7 @@ where
                 // word docids
                 let merger = WordDocidsExtractor::run_extraction(
                     index,
-                    todo!(),
+                    &global_fields_ids_map,
                     /// TODO: GrenadParameters::default() should be removed in favor a passed parameter
                     GrenadParameters::default(),
                     document_changes.clone(),
@@ -100,8 +106,13 @@ where
         handle.join().unwrap()?;
         handle2.join().unwrap()?;
 
-        Ok(())
-    })
+        Ok(()) as Result<_>
+    })?;
+
+    let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
+    index.put_fields_ids_map(wtxn, &fields_ids_map)?;
+
+    Ok(())
 }
 
 /// TODO move this elsewhere
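Note: the tail of this hunk is the "write it to disk" half of the commit message. Once thread::scope returns, every GlobalFieldsIdsMap borrow of the lock is gone, so into_inner() recovers the merged map without cloning and it can be persisted with put_fields_ids_map. A runnable miniature of that wrap → share → recover lifecycle (a Vec stands in for the map, persist for the LMDB write):

    use std::sync::RwLock;
    use std::thread;

    fn persist(fields: &[String]) {
        // stands in for index.put_fields_ids_map(wtxn, &fields_ids_map)
        println!("persisting {} fields", fields.len());
    }

    fn main() {
        let fields_ids_map_lock = RwLock::new(vec!["id".to_owned()]);

        thread::scope(|s| {
            let lock = &fields_ids_map_lock;
            for name in ["title", "overview"] {
                s.spawn(move || {
                    // Extractor threads add newly-seen fields through the lock.
                    lock.write().unwrap().push(name.to_owned());
                });
            }
        });

        // All borrows ended with the scope: take the map back without cloning.
        let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
        assert_eq!(fields_ids_map.len(), 3);
        persist(&fields_ids_map);
    }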
@@ -30,6 +30,7 @@ where
     /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
     fn document_changes(
         self,
+        _fields_ids_map: &mut FieldsIdsMap,
         param: Self::Parameter,
     ) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
         let (fields_ids_map, concurrent_available_ids, primary_key) = param;
@@ -2,7 +2,7 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
 
 use super::DocumentChanges;
 use crate::update::new::DocumentChange;
-use crate::Result;
+use crate::{FieldsIdsMap, Result};
 
 pub struct UpdateByFunction;
 
@@ -11,6 +11,7 @@ impl<'p> DocumentChanges<'p> for UpdateByFunction {
 
     fn document_changes(
         self,
+        _fields_ids_map: &mut FieldsIdsMap,
         _param: Self::Parameter,
     ) -> Result<impl ParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
         Ok((0..100).into_par_iter().map(|_| todo!()))