Make the attribute positions range bounds to be fixed

This commit is contained in:
Clément Renault 2021-03-24 15:06:54 +01:00 committed by many
parent 658f316511
commit 7aa5753ed2
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
3 changed files with 38 additions and 21 deletions

View File

@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fmt::Display; use std::fmt::Display;
use std::fs::{create_dir_all, File}; use std::fs::{create_dir_all, File};
use std::net::SocketAddr; use std::net::SocketAddr;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::Arc;
@ -286,8 +286,8 @@ struct WordsPrefixes {
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
struct WordsLevelPositions { struct WordsLevelPositions {
level_group_size: Option<NonZeroUsize>, level_group_size: Option<NonZeroU32>,
min_level_size: Option<NonZeroUsize>, min_level_size: Option<NonZeroU32>,
} }
// Any value that is present is considered Some value, including null. // Any value that is present is considered Some value, including null.

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::HashSet; use std::collections::HashSet;
use std::fs::File; use std::fs::File;
use std::io::{self, Seek, SeekFrom}; use std::io::{self, Seek, SeekFrom};
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::sync::mpsc::sync_channel; use std::sync::mpsc::sync_channel;
use std::time::Instant; use std::time::Instant;
@ -263,8 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
facet_min_level_size: Option<NonZeroUsize>, facet_min_level_size: Option<NonZeroUsize>,
words_prefix_threshold: Option<f64>, words_prefix_threshold: Option<f64>,
max_prefix_length: Option<usize>, max_prefix_length: Option<usize>,
words_positions_level_group_size: Option<NonZeroUsize>, words_positions_level_group_size: Option<NonZeroU32>,
words_positions_min_level_size: Option<NonZeroUsize>, words_positions_min_level_size: Option<NonZeroU32>,
update_method: IndexDocumentsMethod, update_method: IndexDocumentsMethod,
update_format: UpdateFormat, update_format: UpdateFormat,
autogenerate_docids: bool, autogenerate_docids: bool,

View File

@ -1,7 +1,7 @@
use std::cmp; use std::cmp;
use std::convert::TryFrom; use std::convert::TryFrom;
use std::fs::File; use std::fs::File;
use std::num::NonZeroUsize; use std::num::NonZeroU32;
use grenad::{CompressionType, Reader, Writer, FileFuse}; use grenad::{CompressionType, Reader, Writer, FileFuse};
use heed::types::{DecodeIgnore, Str}; use heed::types::{DecodeIgnore, Str};
@ -20,8 +20,8 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>, pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>, pub(crate) chunk_fusing_shrink_size: Option<u64>,
level_group_size: NonZeroUsize, level_group_size: NonZeroU32,
min_level_size: NonZeroUsize, min_level_size: NonZeroU32,
_update_id: u64, _update_id: u64,
} }
@ -38,18 +38,18 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
chunk_compression_type: CompressionType::None, chunk_compression_type: CompressionType::None,
chunk_compression_level: None, chunk_compression_level: None,
chunk_fusing_shrink_size: None, chunk_fusing_shrink_size: None,
level_group_size: NonZeroUsize::new(4).unwrap(), level_group_size: NonZeroU32::new(4).unwrap(),
min_level_size: NonZeroUsize::new(5).unwrap(), min_level_size: NonZeroU32::new(5).unwrap(),
_update_id: update_id, _update_id: update_id,
} }
} }
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self {
self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap();
self self
} }
pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self {
self.min_level_size = value; self.min_level_size = value;
self self
} }
@ -84,6 +84,20 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
} }
} }
/// Returns the next number after or equal to `x` that is divisible by `d`.
fn next_divisible(x: u32, d: u32) -> u32 {
(x.saturating_sub(1) | (d - 1)) + 1
}
/// Returns the previous number after or equal to `x` that is divisible by `d`,
/// saturates on zero.
fn previous_divisible(x: u32, d: u32) -> u32 {
match x.checked_sub(d - 1) {
Some(0) | None => 0,
Some(x) => next_divisible(x, d),
}
}
/// Generates all the words positions levels based on the levels zero (including the level zero). /// Generates all the words positions levels based on the levels zero (including the level zero).
fn compute_positions_levels( fn compute_positions_levels(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
@ -92,8 +106,8 @@ fn compute_positions_levels(
compression_type: CompressionType, compression_type: CompressionType,
compression_level: Option<u32>, compression_level: Option<u32>,
shrink_size: Option<u64>, shrink_size: Option<u64>,
level_group_size: NonZeroUsize, level_group_size: NonZeroU32,
min_level_size: NonZeroUsize, min_level_size: NonZeroU32,
) -> anyhow::Result<Reader<FileFuse>> ) -> anyhow::Result<Reader<FileFuse>>
{ {
// It is forbidden to keep a cursor and write in a database at the same time with LMDB // It is forbidden to keep a cursor and write in a database at the same time with LMDB
@ -113,7 +127,7 @@ fn compute_positions_levels(
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>() let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
.range(rtxn, &level_0_range)? .range(rtxn, &level_0_range)?
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
// Groups sizes are always a power of the original level_group_size and therefore a group // Groups sizes are always a power of the original level_group_size and therefore a group
// always maps groups of the previous level and never splits previous levels groups in half. // always maps groups of the previous level and never splits previous levels groups in half.
@ -136,20 +150,23 @@ fn compute_positions_levels(
let ((_word, _level, value, _right), docids) = result?; let ((_word, _level, value, _right), docids) = result?;
if i == 0 { if i == 0 {
left = value; left = previous_divisible(value, group_size);
} else if i % group_size == 0 { right = left + (group_size - 1);
}
if value > right {
// we found the first bound of the next group, we must store the left // we found the first bound of the next group, we must store the left
// and right bounds associated with the docids. // and right bounds associated with the docids.
write_level_entry(&mut writer, word, level, left, right, &group_docids)?; write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
// We save the left bound for the new group and also reset the docids. // We save the left bound for the new group and also reset the docids.
group_docids = RoaringBitmap::new(); group_docids = RoaringBitmap::new();
left = value; left = previous_divisible(value, group_size);
right = left + (group_size - 1);
} }
// The right bound is always the bound we run through. // The right bound is always the bound we run through.
group_docids.union_with(&docids); group_docids.union_with(&docids);
right = value;
} }
if !group_docids.is_empty() { if !group_docids.is_empty() {