mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
feat(index): store fields distribution in index
This commit is contained in:
parent
67e25f8724
commit
27c7ab6e00
5 changed files with 45 additions and 26 deletions
|
@ -14,12 +14,14 @@ use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId};
|
|||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use super::merge_function::merge_two_obkvs;
|
||||
use super::{create_writer, create_sorter, IndexDocumentsMethod};
|
||||
use crate::index::FieldsDistribution;
|
||||
|
||||
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
||||
|
||||
pub struct TransformOutput {
|
||||
pub primary_key: String,
|
||||
pub fields_ids_map: FieldsIdsMap,
|
||||
pub fields_distribution: FieldsDistribution,
|
||||
pub external_documents_ids: ExternalDocumentsIds<'static>,
|
||||
pub new_documents_ids: RoaringBitmap,
|
||||
pub replaced_documents_ids: RoaringBitmap,
|
||||
|
@ -74,6 +76,7 @@ impl Transform<'_, '_> {
|
|||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
|
||||
// Deserialize the whole batch of documents in memory.
|
||||
|
@ -103,6 +106,7 @@ impl Transform<'_, '_> {
|
|||
return Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
fields_distribution,
|
||||
external_documents_ids: ExternalDocumentsIds::default(),
|
||||
new_documents_ids: RoaringBitmap::new(),
|
||||
replaced_documents_ids: RoaringBitmap::new(),
|
||||
|
@ -148,6 +152,8 @@ impl Transform<'_, '_> {
|
|||
// We prepare the fields ids map with the documents keys.
|
||||
for (key, _value) in &document {
|
||||
fields_ids_map.insert(&key).context("field id limit reached")?;
|
||||
|
||||
*fields_distribution.entry(key.to_owned()).or_default() += 1;
|
||||
}
|
||||
|
||||
// We retrieve the user id from the document based on the primary key name,
|
||||
|
@ -200,6 +206,7 @@ impl Transform<'_, '_> {
|
|||
sorter,
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
fields_distribution,
|
||||
documents_count,
|
||||
external_documents_ids,
|
||||
progress_callback,
|
||||
|
@ -212,6 +219,7 @@ impl Transform<'_, '_> {
|
|||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
|
||||
let mut csv = csv::Reader::from_reader(reader);
|
||||
|
@ -307,6 +315,10 @@ impl Transform<'_, '_> {
|
|||
json_buffer.clear();
|
||||
serde_json::to_writer(&mut json_buffer, &field)?;
|
||||
writer.insert(*field_id, &json_buffer)?;
|
||||
|
||||
let field_name = fields_ids_map.name(*field_id).unwrap();
|
||||
|
||||
*fields_distribution.entry(field_name.to_string()).or_default() += 1;
|
||||
}
|
||||
|
||||
// We use the extracted/generated user id as the key for this document.
|
||||
|
@ -328,6 +340,7 @@ impl Transform<'_, '_> {
|
|||
sorter,
|
||||
primary_key_name,
|
||||
fields_ids_map,
|
||||
fields_distribution,
|
||||
documents_count,
|
||||
external_documents_ids,
|
||||
progress_callback,
|
||||
|
@ -342,6 +355,7 @@ impl Transform<'_, '_> {
|
|||
sorter: grenad::Sorter<MergeFn>,
|
||||
primary_key: String,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
fields_distribution: FieldsDistribution,
|
||||
approximate_number_of_documents: usize,
|
||||
mut external_documents_ids: ExternalDocumentsIds<'_>,
|
||||
progress_callback: F,
|
||||
|
@ -439,6 +453,7 @@ impl Transform<'_, '_> {
|
|||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
fields_distribution,
|
||||
external_documents_ids: external_documents_ids.into_static(),
|
||||
new_documents_ids,
|
||||
replaced_documents_ids,
|
||||
|
@ -457,6 +472,7 @@ impl Transform<'_, '_> {
|
|||
new_fields_ids_map: FieldsIdsMap,
|
||||
) -> anyhow::Result<TransformOutput>
|
||||
{
|
||||
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
let documents_count = documents_ids.len() as usize;
|
||||
|
@ -492,6 +508,7 @@ impl Transform<'_, '_> {
|
|||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map: new_fields_ids_map,
|
||||
fields_distribution,
|
||||
external_documents_ids: external_documents_ids.into_static(),
|
||||
new_documents_ids: documents_ids,
|
||||
replaced_documents_ids: RoaringBitmap::default(),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue