MeiliSearch/milli/src/update/index_documents/extract/mod.rs

406 lines
16 KiB
Rust
Raw Normal View History

2021-08-16 13:36:30 +02:00
mod extract_docid_word_positions;
mod extract_facet_number_docids;
mod extract_facet_string_docids;
mod extract_fid_docid_facet_values;
mod extract_fid_word_count_docids;
2021-08-23 18:41:48 +02:00
mod extract_geo_points;
2023-06-08 11:35:36 +02:00
mod extract_vector_points;
2021-08-16 13:36:30 +02:00
mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod extract_word_position_docids;
2021-08-16 13:36:30 +02:00
use std::collections::HashSet;
2021-08-16 13:36:30 +02:00
use std::fs::File;
use std::io::BufReader;
2021-08-16 13:36:30 +02:00
use crossbeam_channel::Sender;
use rayon::prelude::*;
use self::extract_docid_word_positions::extract_docid_word_positions;
use self::extract_facet_number_docids::extract_facet_number_docids;
use self::extract_facet_string_docids::extract_facet_string_docids;
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
2021-08-16 13:36:30 +02:00
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
2021-08-23 18:41:48 +02:00
use self::extract_geo_points::extract_geo_points;
use self::extract_vector_points::{
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
};
2021-08-16 13:36:30 +02:00
use self::extract_word_docids::extract_word_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
2021-08-16 13:36:30 +02:00
use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision;
use crate::vector::EmbeddingConfigs;
use crate::{FieldId, FieldsIdsMap, Result};
2021-08-16 13:36:30 +02:00
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
#[allow(clippy::too_many_arguments)]
2024-01-23 09:42:48 +01:00
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
2021-08-16 13:36:30 +02:00
pub(crate) fn data_from_obkv_documents(
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
2021-08-16 13:36:30 +02:00
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
2021-08-16 13:36:30 +02:00
searchable_fields: Option<HashSet<FieldId>>,
faceted_fields: HashSet<FieldId>,
primary_key_id: FieldId,
2022-03-23 17:28:41 +01:00
geo_fields_ids: Option<(FieldId, FieldId)>,
field_id_map: FieldsIdsMap,
stop_words: Option<fst::Set<Vec<u8>>>,
2023-08-10 10:44:07 +02:00
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
2022-03-24 17:00:29 +01:00
exact_attributes: HashSet<FieldId>,
proximity_precision: ProximityPrecision,
embedders: EmbeddingConfigs,
2021-08-16 13:36:30 +02:00
) -> Result<()> {
puffin::profile_function!();
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|| {
original_obkv_chunks
.par_bridge()
.map(|original_documents_chunk| {
send_original_documents_data(
original_documents_chunk,
indexer,
lmdb_writer_sx.clone(),
field_id_map.clone(),
embedders.clone(),
)
})
.collect::<Result<()>>()
2022-03-24 15:22:57 +01:00
},
|| {
flattened_obkv_chunks
.par_bridge()
.map(|flattened_obkv_chunks| {
send_and_extract_flattened_documents_data(
flattened_obkv_chunks,
indexer,
lmdb_writer_sx.clone(),
&searchable_fields,
&faceted_fields,
primary_key_id,
geo_fields_ids,
&stop_words,
&allowed_separators,
&dictionary,
max_positions_per_attributes,
)
})
.map(|result| {
if let Ok((
ref docid_word_positions_chunk,
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
)) = result
{
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
let exact_attributes = exact_attributes.clone();
run_extraction_task::<
_,
_,
(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
),
>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| {
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
},
|(
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
)| {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
},
"word-docids",
);
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_position_docids,
TypedChunk::WordPositionDocids,
"word-position-docids",
);
run_extraction_task::<
_,
_,
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
>(
fid_docid_facet_strings_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
TypedChunk::FieldIdFacetStringDocids,
"field-id-facet-string-docids",
);
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
fid_docid_facet_numbers_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_facet_number_docids,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);
if proximity_precision == ProximityPrecision::ByWord {
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
}
}
2021-08-16 13:36:30 +02:00
Ok(())
})
.collect::<Result<()>>()
},
2021-08-16 13:36:30 +02:00
);
original_pipeline_result.and(flattened_pipeline_result)
2021-08-16 13:36:30 +02:00
}
/// Spawn a new task to extract data for a specific DB using extract_fn.
/// Generated grenad chunks are merged using the merge_fn.
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
/// and sent into lmdb_writer_sx.
fn run_extraction_task<FE, FS, M>(
chunk: grenad::Reader<CursorClonableMmap>,
2021-08-16 13:36:30 +02:00
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
2021-08-16 13:36:30 +02:00
extract_fn: FE,
serialize_fn: FS,
name: &'static str,
) where
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
2021-08-16 13:36:30 +02:00
+ Sync
+ Send
+ 'static,
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
M: Send,
2021-08-16 13:36:30 +02:00
{
2024-01-23 09:42:48 +01:00
let current_span = tracing::Span::current();
2021-08-16 13:36:30 +02:00
rayon::spawn(move || {
2024-02-26 16:38:17 +01:00
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
2024-01-23 09:42:48 +01:00
let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name);
match extract_fn(chunk, indexer) {
Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
})
2021-08-16 13:36:30 +02:00
}
2022-03-23 17:28:41 +01:00
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
2022-03-23 17:28:41 +01:00
fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
2022-03-23 17:28:41 +01:00
lmdb_writer_sx: Sender<Result<TypedChunk>>,
field_id_map: FieldsIdsMap,
embedders: EmbeddingConfigs,
2022-03-23 17:28:41 +01:00
) -> Result<()> {
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = rayon::ThreadPoolBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
2024-03-20 13:25:10 +01:00
.build()?;
rayon::spawn(move || {
for (name, (embedder, prompt)) in embedders {
let result = extract_vector_points(
documents_chunk_cloned.clone(),
indexer,
&field_id_map,
&prompt,
&name,
);
match result {
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
let embeddings = match extract_embeddings(
prompts,
indexer,
embedder.clone(),
&request_threads,
) {
Ok(results) => Some(results),
Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));
None
}
};
if !(remove_vectors.is_empty()
&& manual_vectors.is_empty()
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
{
2023-12-07 17:35:45 +01:00
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension: embedder.dimensions(),
2023-12-07 17:35:45 +01:00
manual_vectors,
embedder_name: name,
2023-12-07 17:35:45 +01:00
}));
}
}
Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));
}
2023-12-07 17:35:45 +01:00
}
}
});
2022-03-23 17:28:41 +01:00
// TODO: create a custom internal error
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
2022-03-23 17:28:41 +01:00
Ok(())
}
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents_ids
/// - docid_word_positions
/// - docid_fid_facet_numbers
/// - docid_fid_facet_strings
/// - docid_fid_facet_exists
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
2022-03-23 17:28:41 +01:00
fn send_and_extract_flattened_documents_data(
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: &Option<HashSet<FieldId>>,
faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId,
2022-03-23 17:28:41 +01:00
geo_fields_ids: Option<(FieldId, FieldId)>,
stop_words: &Option<fst::Set<Vec<u8>>>,
2023-08-10 10:44:07 +02:00
allowed_separators: &Option<&[&str]>,
dictionary: &Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
)> {
2022-03-23 17:28:41 +01:00
let flattened_documents_chunk =
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
2022-03-23 17:28:41 +01:00
if let Some(geo_fields_ids) = geo_fields_ids {
let documents_chunk_cloned = flattened_documents_chunk.clone();
2021-08-30 15:47:11 +02:00
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result =
2022-03-23 17:28:41 +01:00
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
let _ = match result {
2021-08-30 15:47:11 +02:00
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
};
});
}
2021-08-25 16:59:38 +02:00
2023-10-18 13:53:58 +02:00
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join(
|| {
2023-11-06 10:31:14 +01:00
let (docid_word_positions_chunk, script_language_pair) =
2022-10-17 13:51:04 +02:00
extract_docid_word_positions(
flattened_documents_chunk.clone(),
2023-02-21 10:18:44 +01:00
indexer,
2022-10-17 13:51:04 +02:00
searchable_fields,
stop_words.as_ref(),
2023-08-10 10:44:07 +02:00
*allowed_separators,
*dictionary,
2022-10-17 13:51:04 +02:00
max_positions_per_attributes,
)?;
// send docid_word_positions_chunk to DB writer
let docid_word_positions_chunk =
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
2022-10-17 13:51:04 +02:00
let _ =
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
2022-10-12 13:24:56 +02:00
Ok(docid_word_positions_chunk)
},
|| {
let ExtractedFacetValues {
2023-10-18 13:53:58 +02:00
fid_docid_facet_numbers_chunk,
fid_docid_facet_strings_chunk,
fid_facet_is_null_docids_chunk,
2023-03-14 18:08:12 +01:00
fid_facet_is_empty_docids_chunk,
fid_facet_exists_docids_chunk,
} = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(),
indexer,
faceted_fields,
geo_fields_ids,
)?;
2023-10-18 13:53:58 +02:00
// send fid_docid_facet_numbers_chunk to DB writer
let fid_docid_facet_numbers_chunk =
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
2023-10-18 13:53:58 +02:00
fid_docid_facet_numbers_chunk.clone(),
)));
2023-10-18 13:53:58 +02:00
// send fid_docid_facet_strings_chunk to DB writer
let fid_docid_facet_strings_chunk =
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
2023-10-18 13:53:58 +02:00
fid_docid_facet_strings_chunk.clone(),
)));
let _ = lmdb_writer_sx
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
fid_facet_is_empty_docids_chunk,
)));
let _ = lmdb_writer_sx
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
},
);
2021-09-02 15:17:52 +02:00
2023-10-18 13:53:58 +02:00
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
}