From 7aa5753ed282afd2df90f1fae07beb2a1b8eeb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:06:54 +0100 Subject: [PATCH] Make the attribute positions range bounds to be fixed --- http-ui/src/main.rs | 6 +-- milli/src/update/index_documents/mod.rs | 6 +-- milli/src/update/words_level_positions.rs | 47 +++++++++++++++-------- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index dbf7aadce..c85bd9b15 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; use std::net::SocketAddr; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -286,8 +286,8 @@ struct WordsPrefixes { #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct WordsLevelPositions { - level_group_size: Option, - min_level_size: Option, + level_group_size: Option, + min_level_size: Option, } // Any value that is present is considered Some value, including null. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3a41a52ae..7a2196481 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -263,8 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, - words_positions_level_group_size: Option, - words_positions_min_level_size: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 4286fc780..eb8d3bb3c 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,7 +1,7 @@ use std::cmp; use std::convert::TryFrom; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::NonZeroU32; use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{DecodeIgnore, Str}; @@ -20,8 +20,8 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, _update_id: u64, } @@ -38,18 +38,18 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), + level_group_size: NonZeroU32::new(4).unwrap(), + min_level_size: NonZeroU32::new(5).unwrap(), _update_id: update_id, } } - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { + self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); self } - pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { self.min_level_size = value; self } @@ -84,6 +84,20 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } +/// Returns the next number after or equal to `x` that is divisible by `d`. +fn next_divisible(x: u32, d: u32) -> u32 { + (x.saturating_sub(1) | (d - 1)) + 1 +} + +/// Returns the previous number after or equal to `x` that is divisible by `d`, +/// saturates on zero. +fn previous_divisible(x: u32, d: u32) -> u32 { + match x.checked_sub(d - 1) { + Some(0) | None => 0, + Some(x) => next_divisible(x, d), + } +} + /// Generates all the words positions levels based on the levels zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, @@ -92,8 +106,8 @@ fn compute_positions_levels( compression_type: CompressionType, compression_level: Option, shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, ) -> anyhow::Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB @@ -113,7 +127,7 @@ fn compute_positions_levels( let first_level_size = words_positions_db.remap_data_type::() .range(rtxn, &level_0_range)? - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -136,20 +150,23 @@ fn compute_positions_levels( let ((_word, _level, value, _right), docids) = result?; if i == 0 { - left = value; - } else if i % group_size == 0 { + left = previous_divisible(value, group_size); + right = left + (group_size - 1); + } + + if value > right { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. write_level_entry(&mut writer, word, level, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. group_docids = RoaringBitmap::new(); - left = value; + left = previous_divisible(value, group_size); + right = left + (group_size - 1); } // The right bound is always the bound we run through. group_docids.union_with(&docids); - right = value; } if !group_docids.is_empty() {