mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
feat(index): store fields distribution in index
This commit is contained in:
parent
67e25f8724
commit
27c7ab6e00
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -1520,7 +1520,8 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "pest"
|
name = "pest"
|
||||||
version = "2.1.3"
|
version = "2.1.3"
|
||||||
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ucd-trie",
|
"ucd-trie",
|
||||||
]
|
]
|
||||||
@ -1528,8 +1529,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "pest"
|
name = "pest"
|
||||||
version = "2.1.3"
|
version = "2.1.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
|
||||||
checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ucd-trie",
|
"ucd-trie",
|
||||||
]
|
]
|
||||||
|
@ -23,6 +23,7 @@ pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
|
|||||||
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
|
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
|
||||||
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
|
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
|
||||||
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
|
||||||
|
pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
|
||||||
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
||||||
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
|
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
|
||||||
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
|
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
|
||||||
@ -33,6 +34,8 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
|
|||||||
const CREATED_AT_KEY: &str = "created-at";
|
const CREATED_AT_KEY: &str = "created-at";
|
||||||
const UPDATED_AT_KEY: &str = "updated-at";
|
const UPDATED_AT_KEY: &str = "updated-at";
|
||||||
|
|
||||||
|
pub type FieldsDistribution = HashMap<String, u64>;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Index {
|
pub struct Index {
|
||||||
/// The LMDB environment which this index is associated with.
|
/// The LMDB environment which this index is associated with.
|
||||||
@ -204,23 +207,18 @@ impl Index {
|
|||||||
Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default())
|
Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fields ids distribution */
|
/* fields distribution */
|
||||||
|
|
||||||
/// Returns the fields ids distribution which associate the internal field ids
|
/// Writes the fields distribution which associate the field with the number of times
|
||||||
/// with the number of times it occurs in the obkv documents.
|
/// it occurs in the obkv documents.
|
||||||
// TODO store in the index itself and change only within updates that modify the documents
|
pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
|
||||||
pub fn fields_ids_distribution(&self, rtxn: &RoTxn) -> anyhow::Result<HashMap<FieldId, u64>> {
|
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution)
|
||||||
let mut distribution = HashMap::new();
|
|
||||||
|
|
||||||
for document in self.documents.iter(rtxn)? {
|
|
||||||
let (_, obkv) = document?;
|
|
||||||
|
|
||||||
for (field_id, _) in obkv.iter() {
|
|
||||||
*distribution.entry(field_id).or_default() += 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(distribution)
|
/// Returns the fields distribution which associate the field with the number of times
|
||||||
|
/// it occurs in the obkv documents.
|
||||||
|
pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
|
||||||
|
Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
|
||||||
/* displayed fields */
|
/* displayed fields */
|
||||||
@ -469,6 +467,7 @@ impl Index {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
|
use maplit::hashmap;
|
||||||
|
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
use crate::update::{IndexDocuments, UpdateFormat};
|
use crate::update::{IndexDocuments, UpdateFormat};
|
||||||
@ -493,16 +492,15 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn fields_ids_distribution() {
|
fn initial_fields_distribution() {
|
||||||
let index = prepare_index();
|
let index = prepare_index();
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
let fields_distribution = index.fields_distribution(&rtxn).unwrap();
|
||||||
|
assert_eq!(fields_distribution, hashmap!{
|
||||||
let fields_ids_distribution = index.fields_ids_distribution(&rtxn).unwrap();
|
"age".to_string() => 1,
|
||||||
assert_eq!(fields_ids_distribution.len(), 2);
|
"name".to_string() => 2
|
||||||
assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("age").unwrap()), Some(&1));
|
});
|
||||||
assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("name").unwrap()), Some(&2));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -358,6 +358,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
let TransformOutput {
|
let TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
|
fields_distribution,
|
||||||
external_documents_ids,
|
external_documents_ids,
|
||||||
new_documents_ids,
|
new_documents_ids,
|
||||||
replaced_documents_ids,
|
replaced_documents_ids,
|
||||||
@ -551,6 +552,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
// We write the fields ids map into the main database
|
// We write the fields ids map into the main database
|
||||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||||
|
|
||||||
|
// We write the fields distribution into the main database
|
||||||
|
self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
|
||||||
|
|
||||||
// We write the primary key field id into the main database
|
// We write the primary key field id into the main database
|
||||||
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
||||||
|
|
||||||
|
@ -14,12 +14,14 @@ use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId};
|
|||||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||||
use super::merge_function::merge_two_obkvs;
|
use super::merge_function::merge_two_obkvs;
|
||||||
use super::{create_writer, create_sorter, IndexDocumentsMethod};
|
use super::{create_writer, create_sorter, IndexDocumentsMethod};
|
||||||
|
use crate::index::FieldsDistribution;
|
||||||
|
|
||||||
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
||||||
|
|
||||||
pub struct TransformOutput {
|
pub struct TransformOutput {
|
||||||
pub primary_key: String,
|
pub primary_key: String,
|
||||||
pub fields_ids_map: FieldsIdsMap,
|
pub fields_ids_map: FieldsIdsMap,
|
||||||
|
pub fields_distribution: FieldsDistribution,
|
||||||
pub external_documents_ids: ExternalDocumentsIds<'static>,
|
pub external_documents_ids: ExternalDocumentsIds<'static>,
|
||||||
pub new_documents_ids: RoaringBitmap,
|
pub new_documents_ids: RoaringBitmap,
|
||||||
pub replaced_documents_ids: RoaringBitmap,
|
pub replaced_documents_ids: RoaringBitmap,
|
||||||
@ -74,6 +76,7 @@ impl Transform<'_, '_> {
|
|||||||
F: Fn(UpdateIndexingStep) + Sync,
|
F: Fn(UpdateIndexingStep) + Sync,
|
||||||
{
|
{
|
||||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||||
|
let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||||
|
|
||||||
// Deserialize the whole batch of documents in memory.
|
// Deserialize the whole batch of documents in memory.
|
||||||
@ -103,6 +106,7 @@ impl Transform<'_, '_> {
|
|||||||
return Ok(TransformOutput {
|
return Ok(TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
|
fields_distribution,
|
||||||
external_documents_ids: ExternalDocumentsIds::default(),
|
external_documents_ids: ExternalDocumentsIds::default(),
|
||||||
new_documents_ids: RoaringBitmap::new(),
|
new_documents_ids: RoaringBitmap::new(),
|
||||||
replaced_documents_ids: RoaringBitmap::new(),
|
replaced_documents_ids: RoaringBitmap::new(),
|
||||||
@ -148,6 +152,8 @@ impl Transform<'_, '_> {
|
|||||||
// We prepare the fields ids map with the documents keys.
|
// We prepare the fields ids map with the documents keys.
|
||||||
for (key, _value) in &document {
|
for (key, _value) in &document {
|
||||||
fields_ids_map.insert(&key).context("field id limit reached")?;
|
fields_ids_map.insert(&key).context("field id limit reached")?;
|
||||||
|
|
||||||
|
*fields_distribution.entry(key.to_owned()).or_default() += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We retrieve the user id from the document based on the primary key name,
|
// We retrieve the user id from the document based on the primary key name,
|
||||||
@ -200,6 +206,7 @@ impl Transform<'_, '_> {
|
|||||||
sorter,
|
sorter,
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
|
fields_distribution,
|
||||||
documents_count,
|
documents_count,
|
||||||
external_documents_ids,
|
external_documents_ids,
|
||||||
progress_callback,
|
progress_callback,
|
||||||
@ -212,6 +219,7 @@ impl Transform<'_, '_> {
|
|||||||
F: Fn(UpdateIndexingStep) + Sync,
|
F: Fn(UpdateIndexingStep) + Sync,
|
||||||
{
|
{
|
||||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||||
|
let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||||
|
|
||||||
let mut csv = csv::Reader::from_reader(reader);
|
let mut csv = csv::Reader::from_reader(reader);
|
||||||
@ -307,6 +315,10 @@ impl Transform<'_, '_> {
|
|||||||
json_buffer.clear();
|
json_buffer.clear();
|
||||||
serde_json::to_writer(&mut json_buffer, &field)?;
|
serde_json::to_writer(&mut json_buffer, &field)?;
|
||||||
writer.insert(*field_id, &json_buffer)?;
|
writer.insert(*field_id, &json_buffer)?;
|
||||||
|
|
||||||
|
let field_name = fields_ids_map.name(*field_id).unwrap();
|
||||||
|
|
||||||
|
*fields_distribution.entry(field_name.to_string()).or_default() += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We use the extracted/generated user id as the key for this document.
|
// We use the extracted/generated user id as the key for this document.
|
||||||
@ -328,6 +340,7 @@ impl Transform<'_, '_> {
|
|||||||
sorter,
|
sorter,
|
||||||
primary_key_name,
|
primary_key_name,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
|
fields_distribution,
|
||||||
documents_count,
|
documents_count,
|
||||||
external_documents_ids,
|
external_documents_ids,
|
||||||
progress_callback,
|
progress_callback,
|
||||||
@ -342,6 +355,7 @@ impl Transform<'_, '_> {
|
|||||||
sorter: grenad::Sorter<MergeFn>,
|
sorter: grenad::Sorter<MergeFn>,
|
||||||
primary_key: String,
|
primary_key: String,
|
||||||
fields_ids_map: FieldsIdsMap,
|
fields_ids_map: FieldsIdsMap,
|
||||||
|
fields_distribution: FieldsDistribution,
|
||||||
approximate_number_of_documents: usize,
|
approximate_number_of_documents: usize,
|
||||||
mut external_documents_ids: ExternalDocumentsIds<'_>,
|
mut external_documents_ids: ExternalDocumentsIds<'_>,
|
||||||
progress_callback: F,
|
progress_callback: F,
|
||||||
@ -439,6 +453,7 @@ impl Transform<'_, '_> {
|
|||||||
Ok(TransformOutput {
|
Ok(TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
|
fields_distribution,
|
||||||
external_documents_ids: external_documents_ids.into_static(),
|
external_documents_ids: external_documents_ids.into_static(),
|
||||||
new_documents_ids,
|
new_documents_ids,
|
||||||
replaced_documents_ids,
|
replaced_documents_ids,
|
||||||
@ -457,6 +472,7 @@ impl Transform<'_, '_> {
|
|||||||
new_fields_ids_map: FieldsIdsMap,
|
new_fields_ids_map: FieldsIdsMap,
|
||||||
) -> anyhow::Result<TransformOutput>
|
) -> anyhow::Result<TransformOutput>
|
||||||
{
|
{
|
||||||
|
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
|
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
|
||||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||||
let documents_count = documents_ids.len() as usize;
|
let documents_count = documents_ids.len() as usize;
|
||||||
@ -492,6 +508,7 @@ impl Transform<'_, '_> {
|
|||||||
Ok(TransformOutput {
|
Ok(TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map: new_fields_ids_map,
|
fields_ids_map: new_fields_ids_map,
|
||||||
|
fields_distribution,
|
||||||
external_documents_ids: external_documents_ids.into_static(),
|
external_documents_ids: external_documents_ids.into_static(),
|
||||||
new_documents_ids: documents_ids,
|
new_documents_ids: documents_ids,
|
||||||
replaced_documents_ids: RoaringBitmap::default(),
|
replaced_documents_ids: RoaringBitmap::default(),
|
||||||
|
@ -183,7 +183,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Udpates the index's searchable attributes. This causes the field map to be recomputed to
|
/// Updates the index's searchable attributes. This causes the field map to be recomputed to
|
||||||
/// reflect the order of the searchable attributes.
|
/// reflect the order of the searchable attributes.
|
||||||
fn update_searchable(&mut self) -> anyhow::Result<bool> {
|
fn update_searchable(&mut self) -> anyhow::Result<bool> {
|
||||||
match self.searchable_fields {
|
match self.searchable_fields {
|
||||||
|
Loading…
Reference in New Issue
Block a user