mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Make the attribute positions range bounds to be fixed
This commit is contained in:
parent
658f316511
commit
7aa5753ed2
@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
|||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::fs::{create_dir_all, File};
|
use std::fs::{create_dir_all, File};
|
||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::num::NonZeroUsize;
|
use std::num::{NonZeroU32, NonZeroUsize};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@ -286,8 +286,8 @@ struct WordsPrefixes {
|
|||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
struct WordsLevelPositions {
|
struct WordsLevelPositions {
|
||||||
level_group_size: Option<NonZeroUsize>,
|
level_group_size: Option<NonZeroU32>,
|
||||||
min_level_size: Option<NonZeroUsize>,
|
min_level_size: Option<NonZeroU32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Any value that is present is considered Some value, including null.
|
// Any value that is present is considered Some value, including null.
|
||||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, Seek, SeekFrom};
|
use std::io::{self, Seek, SeekFrom};
|
||||||
use std::num::NonZeroUsize;
|
use std::num::{NonZeroU32, NonZeroUsize};
|
||||||
use std::sync::mpsc::sync_channel;
|
use std::sync::mpsc::sync_channel;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
@ -263,8 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
facet_min_level_size: Option<NonZeroUsize>,
|
facet_min_level_size: Option<NonZeroUsize>,
|
||||||
words_prefix_threshold: Option<f64>,
|
words_prefix_threshold: Option<f64>,
|
||||||
max_prefix_length: Option<usize>,
|
max_prefix_length: Option<usize>,
|
||||||
words_positions_level_group_size: Option<NonZeroUsize>,
|
words_positions_level_group_size: Option<NonZeroU32>,
|
||||||
words_positions_min_level_size: Option<NonZeroUsize>,
|
words_positions_min_level_size: Option<NonZeroU32>,
|
||||||
update_method: IndexDocumentsMethod,
|
update_method: IndexDocumentsMethod,
|
||||||
update_format: UpdateFormat,
|
update_format: UpdateFormat,
|
||||||
autogenerate_docids: bool,
|
autogenerate_docids: bool,
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp;
|
use std::cmp;
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::num::NonZeroUsize;
|
use std::num::NonZeroU32;
|
||||||
|
|
||||||
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||||
use heed::types::{DecodeIgnore, Str};
|
use heed::types::{DecodeIgnore, Str};
|
||||||
@ -20,8 +20,8 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
pub(crate) chunk_compression_type: CompressionType,
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||||
level_group_size: NonZeroUsize,
|
level_group_size: NonZeroU32,
|
||||||
min_level_size: NonZeroUsize,
|
min_level_size: NonZeroU32,
|
||||||
_update_id: u64,
|
_update_id: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -38,18 +38,18 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
chunk_compression_type: CompressionType::None,
|
chunk_compression_type: CompressionType::None,
|
||||||
chunk_compression_level: None,
|
chunk_compression_level: None,
|
||||||
chunk_fusing_shrink_size: None,
|
chunk_fusing_shrink_size: None,
|
||||||
level_group_size: NonZeroUsize::new(4).unwrap(),
|
level_group_size: NonZeroU32::new(4).unwrap(),
|
||||||
min_level_size: NonZeroUsize::new(5).unwrap(),
|
min_level_size: NonZeroU32::new(5).unwrap(),
|
||||||
_update_id: update_id,
|
_update_id: update_id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self {
|
pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self {
|
||||||
self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap();
|
self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap();
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self {
|
pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self {
|
||||||
self.min_level_size = value;
|
self.min_level_size = value;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
@ -84,6 +84,20 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the smallest multiple of `d` that is greater than or equal to `x`.
///
/// Works for any non-zero divisor, not only powers of two: group sizes are
/// powers of the configurable level group size, which may be e.g. 3.
/// `0` is considered divisible by every `d`, so `next_divisible(0, d) == 0`.
///
/// # Panics
///
/// Panics if `d` is zero. The final addition overflows if the result does
/// not fit in a `u32`.
fn next_divisible(x: u32, d: u32) -> u32 {
    match x % d {
        // `x` already sits on a multiple of `d`.
        0 => x,
        // Otherwise move up by the distance to the next multiple.
        remainder => x + (d - remainder),
    }
}
|
||||||
|
|
||||||
|
/// Returns the greatest multiple of `d` that is less than or equal to `x`,
/// which is `0` whenever `x < d`.
///
/// Works for any non-zero divisor, not only powers of two: group sizes are
/// powers of the configurable level group size, which may be e.g. 3.
///
/// # Panics
///
/// Panics if `d` is zero.
fn previous_divisible(x: u32, d: u32) -> u32 {
    // Dropping the remainder snaps `x` down onto the group boundary and can
    // never underflow since `x % d <= x`.
    x - x % d
}
|
||||||
|
|
||||||
/// Generates all the words positions levels based on the levels zero (including the level zero).
|
/// Generates all the words positions levels based on the levels zero (including the level zero).
|
||||||
fn compute_positions_levels(
|
fn compute_positions_levels(
|
||||||
rtxn: &heed::RoTxn,
|
rtxn: &heed::RoTxn,
|
||||||
@ -92,8 +106,8 @@ fn compute_positions_levels(
|
|||||||
compression_type: CompressionType,
|
compression_type: CompressionType,
|
||||||
compression_level: Option<u32>,
|
compression_level: Option<u32>,
|
||||||
shrink_size: Option<u64>,
|
shrink_size: Option<u64>,
|
||||||
level_group_size: NonZeroUsize,
|
level_group_size: NonZeroU32,
|
||||||
min_level_size: NonZeroUsize,
|
min_level_size: NonZeroU32,
|
||||||
) -> anyhow::Result<Reader<FileFuse>>
|
) -> anyhow::Result<Reader<FileFuse>>
|
||||||
{
|
{
|
||||||
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||||
@ -113,7 +127,7 @@ fn compute_positions_levels(
|
|||||||
|
|
||||||
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
||||||
.range(rtxn, &level_0_range)?
|
.range(rtxn, &level_0_range)?
|
||||||
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
|
||||||
|
|
||||||
// Groups sizes are always a power of the original level_group_size and therefore a group
|
// Groups sizes are always a power of the original level_group_size and therefore a group
|
||||||
// always maps groups of the previous level and never splits previous levels groups in half.
|
// always maps groups of the previous level and never splits previous levels groups in half.
|
||||||
@ -136,20 +150,23 @@ fn compute_positions_levels(
|
|||||||
let ((_word, _level, value, _right), docids) = result?;
|
let ((_word, _level, value, _right), docids) = result?;
|
||||||
|
|
||||||
if i == 0 {
|
if i == 0 {
|
||||||
left = value;
|
left = previous_divisible(value, group_size);
|
||||||
} else if i % group_size == 0 {
|
right = left + (group_size - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if value > right {
|
||||||
// we found the first bound of the next group, we must store the left
|
// we found the first bound of the next group, we must store the left
|
||||||
// and right bounds associated with the docids.
|
// and right bounds associated with the docids.
|
||||||
write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
|
write_level_entry(&mut writer, word, level, left, right, &group_docids)?;
|
||||||
|
|
||||||
// We save the left bound for the new group and also reset the docids.
|
// We save the left bound for the new group and also reset the docids.
|
||||||
group_docids = RoaringBitmap::new();
|
group_docids = RoaringBitmap::new();
|
||||||
left = value;
|
left = previous_divisible(value, group_size);
|
||||||
|
right = left + (group_size - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The right bound is always the bound we run through.
|
// The right bound is always the bound we run through.
|
||||||
group_docids.union_with(&docids);
|
group_docids.union_with(&docids);
|
||||||
right = value;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if !group_docids.is_empty() {
|
if !group_docids.is_empty() {
|
||||||
|
Loading…
Reference in New Issue
Block a user