Make the facet levels map to previous level groups and don't split them

Clément Renault 2020-11-28 12:43:43 +01:00
parent 276c87af68
commit ba4ba685f9
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 41 additions and 125 deletions


@@ -28,7 +28,7 @@ use warp::{Filter, http::Response};
use milli::tokenizer::{simple_tokenizer, TokenType};
use milli::update::UpdateIndexingStep::*;
-use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat, EasingName};
+use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition};
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
@@ -237,9 +237,8 @@ struct Settings {
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
struct Facets {
-last_level_size: Option<NonZeroUsize>,
-number_of_levels: Option<NonZeroUsize>,
-easing_function: Option<String>,
+level_group_size: Option<NonZeroUsize>,
+min_level_size: Option<NonZeroUsize>,
}
// Any value that is present is considered Some value, including null.
@@ -415,27 +414,12 @@ async fn main() -> anyhow::Result<()> {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
let mut builder = update_builder.facets(&mut wtxn, &index_cloned);
-if let Some(value) = levels.last_level_size {
-builder.last_level_size(value);
+if let Some(value) = levels.level_group_size {
+builder.level_group_size(value);
}
-if let Some(value) = levels.number_of_levels {
-builder.number_of_levels(value);
+if let Some(value) = levels.min_level_size {
+builder.min_level_size(value);
}
-if let Some(value) = levels.easing_function {
-let easing_name = if value.eq_ignore_ascii_case("expo") {
-EasingName::Expo
-} else if value.eq_ignore_ascii_case("quart") {
-EasingName::Quart
-} else if value.eq_ignore_ascii_case("circ") {
-EasingName::Circ
-} else if value.eq_ignore_ascii_case("linear") {
-EasingName::Linear
-} else {
-panic!("Invalid easing function name")
-};
-builder.easing_function(easing_name);
-}
match builder.execute() {
Ok(()) => wtxn.commit().map_err(Into::into),
Err(e) => Err(e.into())
@@ -804,7 +788,7 @@ async fn main() -> anyhow::Result<()> {
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let change_facet_levels_route = warp::filters::method::post()
.and(warp::path!("facet-levels"))
.and(warp::path!("facet-level-sizes"))
.and(warp::body::json())
.map(move |levels: Facets| {
let meta = UpdateMeta::Facets(levels);
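The renamed route accepts the two remaining settings as a JSON body; with the struct's camelCase renaming, a request would look roughly like this (a sketch assuming the filter is mounted at the root of the http-ui server, as the snippet above suggests; both fields are optional and the values are hypothetical):

    POST /facet-level-sizes
    Content-Type: application/json

    { "levelGroupSize": 4, "minLevelSize": 5 }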


@@ -1,10 +1,10 @@
+use std::cmp;
use std::fs::File;
use std::num::NonZeroUsize;
use grenad::{CompressionType, Reader, Writer, FileFuse};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesEncode, Error};
-use itertools::Itertools;
use log::debug;
use num_traits::{Bounded, Zero};
use roaring::RoaringBitmap;
@@ -16,23 +16,14 @@ use crate::Index;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
-#[derive(Debug, Copy, Clone)]
-pub enum EasingName {
-Expo,
-Quart,
-Circ,
-Linear,
-}
pub struct Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
-number_of_levels: NonZeroUsize,
-last_level_size: NonZeroUsize,
-easing_function: EasingName,
+level_group_size: NonZeroUsize,
+min_level_size: NonZeroUsize,
}
impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
@@ -43,24 +34,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
-number_of_levels: NonZeroUsize::new(5).unwrap(),
-last_level_size: NonZeroUsize::new(5).unwrap(),
-easing_function: EasingName::Expo,
+level_group_size: NonZeroUsize::new(4).unwrap(),
+min_level_size: NonZeroUsize::new(5).unwrap(),
}
}
-pub fn number_of_levels(&mut self, value: NonZeroUsize) -> &mut Self {
-self.number_of_levels = value;
+pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self {
+self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap();
self
}
-pub fn last_level_size(&mut self, value: NonZeroUsize) -> &mut Self {
-self.last_level_size = value;
-self
-}
-pub fn easing_function(&mut self, value: EasingName) -> &mut Self {
-self.easing_function = value;
+pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self {
+self.min_level_size = value;
self
}
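Note that level_group_size is clamped to at least 2, since a group holding a single entry would never merge anything from the level below. A minimal standalone sketch of that clamp (the variable names are illustrative, not part of the crate):

    use std::cmp;
    use std::num::NonZeroUsize;

    fn main() {
        // Mirrors the body of Facets::level_group_size above.
        let requested = NonZeroUsize::new(1).unwrap();
        let stored = NonZeroUsize::new(cmp::max(requested.get(), 2)).unwrap();
        assert_eq!(stored.get(), 2); // anything below 2 is bumped to the minimum useful size
    }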
@@ -90,9 +75,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
-self.last_level_size,
-self.number_of_levels,
-self.easing_function,
+self.level_group_size,
+self.min_level_size,
field_id,
)?;
@@ -117,9 +101,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
-self.last_level_size,
-self.number_of_levels,
-self.easing_function,
+self.level_group_size,
+self.min_level_size,
field_id,
)?;
@@ -175,9 +158,8 @@ fn compute_facet_levels<'t, T: 't, KC>(
compression_type: CompressionType,
compression_level: Option<u32>,
shrink_size: Option<u64>,
-last_level_size: NonZeroUsize,
-number_of_levels: NonZeroUsize,
-easing_function: EasingName,
+level_group_size: NonZeroUsize,
+min_level_size: NonZeroUsize,
field_id: u8,
) -> anyhow::Result<Reader<FileFuse>>
where
@@ -201,15 +183,13 @@ where
left..=right
};
-let level_sizes_iter =
-levels_iterator(first_level_size, last_level_size.get(), number_of_levels.get(), easing_function)
-.map(|size| (first_level_size as f64 / size as f64).ceil() as usize)
-.unique()
-.enumerate()
-.skip(1);
+// Groups sizes are always a power of the original level_group_size and therefore a group
+// always maps groups of the previous level and never splits previous levels groups in half.
+let group_size_iter = (1u8..)
+.map(|l| (l, level_group_size.get().pow(l as u32)))
+.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
-// TODO we must not create levels with identical group sizes.
-for (level, level_entry_sizes) in level_sizes_iter {
+for (level, group_size) in group_size_iter {
let mut left = T::zero();
let mut right = T::zero();
let mut group_docids = RoaringBitmap::new();
@@ -220,10 +200,10 @@ where
if i == 0 {
left = value;
-} else if i % level_entry_sizes == 0 {
+} else if i % group_size == 0 {
// we found the first bound of the next group, we must store the left
// and right bounds associated with the docids.
-write_entry::<T, KC>(&mut writer, field_id, level as u8, left, right, &group_docids)?;
+write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
// We save the left bound for the new group and also reset the docids.
group_docids = RoaringBitmap::new();
@@ -236,7 +216,7 @@ where
}
if !group_docids.is_empty() {
-write_entry::<T, KC>(&mut writer, field_id, level as u8, left, right, &group_docids)?;
+write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?;
}
}
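To see how many levels the new iterator actually builds, here is a small standalone sketch (not milli code) that reproduces group_size_iter for a hypothetical facet field with 1000 level-0 entries, using the defaults from Facets::new above (level_group_size = 4, min_level_size = 5):

    fn main() {
        let first_level_size = 1000usize; // hypothetical number of level-0 entries
        let level_group_size = 4usize;    // default from Facets::new above
        let min_level_size = 5usize;      // default from Facets::new above

        let levels: Vec<(u8, usize)> = (1u8..)
            .map(|l| (l, level_group_size.pow(l as u32)))
            .take_while(|(_, s)| first_level_size / *s >= min_level_size)
            .collect();

        // Prints [(1, 4), (2, 16), (3, 64)]: levels 1 to 3 group 4, 16 and 64 level-0
        // entries each. A fourth level would hold only 1000 / 256 = 3 groups, fewer than
        // min_level_size, so it is not built. Because every group size is a power of 4,
        // a group always covers whole groups of the level below and never splits one.
        println!("{:?}", levels);
    }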
@@ -274,51 +254,3 @@ where
writer.insert(&key, &data)?;
Ok(())
}
-fn levels_iterator(
-first_level_size: usize, // biggest level
-last_level_size: usize, // smallest level
-number_of_levels: usize,
-easing_function: EasingName,
-) -> impl Iterator<Item=usize>
-{
-let easing_function = match easing_function {
-EasingName::Expo => ease_out_expo,
-EasingName::Quart => ease_out_quart,
-EasingName::Circ => ease_out_circ,
-EasingName::Linear => ease_out_linear,
-};
-let b = last_level_size as f64;
-let end = first_level_size as f64;
-let c = end - b;
-let d = number_of_levels;
-(0..=d).map(move |t| ((end + b) - easing_function(t as f64, b, c, d as f64)) as usize)
-}
-// Go look at the function definitions here:
-// https://docs.rs/easer/0.2.1/easer/index.html
-// https://easings.net/#easeOutExpo
-fn ease_out_expo(t: f64, b: f64, c: f64, d: f64) -> f64 {
-if t == d {
-b + c
-} else {
-c * (-2.0_f64.powf(-10.0 * t / d) + 1.0) + b
-}
-}
-// https://easings.net/#easeOutCirc
-fn ease_out_circ(t: f64, b: f64, c: f64, d: f64) -> f64 {
-let t = t / d - 1.0;
-c * (1.0 - t * t).sqrt() + b
-}
-// https://easings.net/#easeOutQuart
-fn ease_out_quart(t: f64, b: f64, c: f64, d: f64) -> f64 {
-let t = t / d - 1.0;
--c * ((t * t * t * t) - 1.0) + b
-}
-fn ease_out_linear(t: f64, b: f64, c: f64, d: f64) -> f64 {
-c * t / d + b
-}
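For contrast, here is a standalone sketch (not part of the crate) of what the removed easing-based computation produced with its old defaults (last_level_size = 5, number_of_levels = 5, EasingName::Expo) and the same hypothetical 1000 level-0 entries; the resulting group sizes are not multiples of one another, which is exactly why an upper level could split the groups of the level below:

    // Copied from the removed code above.
    fn ease_out_expo(t: f64, b: f64, c: f64, d: f64) -> f64 {
        if t == d { b + c } else { c * (-2.0_f64.powf(-10.0 * t / d) + 1.0) + b }
    }

    fn main() {
        let first_level_size = 1000.0_f64; // hypothetical number of level-0 entries
        let (b, d) = (5.0_f64, 5.0_f64);   // old last_level_size and number_of_levels defaults
        let c = first_level_size - b;

        let group_sizes: Vec<usize> = (0..=d as usize)
            .map(|t| ((first_level_size + b) - ease_out_expo(t as f64, b, c, d)) as usize)
            .map(|size| (first_level_size / size as f64).ceil() as usize)
            .collect();

        // Prints [1, 4, 15, 50, 125, 200] (the real code then deduplicated the sizes and
        // skipped the first one). 15 is not a multiple of 4 and 50 is not a multiple of 15,
        // so a group of one level could start in the middle of a group of the level below.
        println!("{:?}", group_sizes);
    }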


@@ -208,8 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) thread_pool: Option<&'a ThreadPool>,
-facet_number_of_levels: Option<NonZeroUsize>,
-facet_last_level_size: Option<NonZeroUsize>,
+facet_level_group_size: Option<NonZeroUsize>,
+facet_min_level_size: Option<NonZeroUsize>,
update_method: IndexDocumentsMethod,
update_format: UpdateFormat,
autogenerate_docids: bool,
@@ -228,8 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
thread_pool: None,
-facet_number_of_levels: None,
-facet_last_level_size: None,
+facet_level_group_size: None,
+facet_min_level_size: None,
update_method: IndexDocumentsMethod::ReplaceDocuments,
update_format: UpdateFormat::Json,
autogenerate_docids: true,
@@ -588,11 +588,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
-if let Some(value) = self.facet_number_of_levels {
-builder.number_of_levels(value);
+if let Some(value) = self.facet_level_group_size {
+builder.level_group_size(value);
}
-if let Some(value) = self.facet_last_level_size {
-builder.last_level_size(value);
+if let Some(value) = self.facet_min_level_size {
+builder.min_level_size(value);
}
builder.execute()?;


@@ -12,7 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments;
pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
-pub use self::facets::{Facets, EasingName};
+pub use self::facets::Facets;
pub use self::settings::Settings;
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;
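Downstream, the only facet-levels API left in the update module is the Facets builder itself. A minimal sketch of driving it from application code, following the http-ui handler above (wtxn, index and update_builder as created there; the sizes are hypothetical):

    let mut builder = update_builder.facets(&mut wtxn, &index);
    builder.level_group_size(NonZeroUsize::new(8).unwrap()); // values below 2 would be bumped to 2
    builder.min_level_size(NonZeroUsize::new(10).unwrap());  // stop once a level would hold fewer than 10 groups
    builder.execute()?;
    wtxn.commit()?;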