mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Fix and improve the words-level-positions computation
This commit is contained in:
parent
6b1b42b928
commit
3069bf4f4a
@ -558,7 +558,9 @@ fn words_level_positions_docids(
|
|||||||
left..=right
|
left..=right
|
||||||
};
|
};
|
||||||
for result in index.word_level_position_docids.range(rtxn, &range)? {
|
for result in index.word_level_position_docids.range(rtxn, &range)? {
|
||||||
let ((word, level, left, right), docids) = result?;
|
let ((w, level, left, right), docids) = result?;
|
||||||
|
if word != w { break }
|
||||||
|
|
||||||
let level = level.to_string();
|
let level = level.to_string();
|
||||||
let count = docids.len().to_string();
|
let count = docids.len().to_string();
|
||||||
let docids = if debug {
|
let docids = if debug {
|
||||||
@ -567,7 +569,7 @@ fn words_level_positions_docids(
|
|||||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||||
};
|
};
|
||||||
let position_range = format!("{:?}", left..=right);
|
let position_range = format!("{:?}", left..=right);
|
||||||
wtr.write_record(&[word, &level, &position_range, &count, &docids])?;
|
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -388,7 +388,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
|||||||
key_buffer.push(0); // level 0
|
key_buffer.push(0); // level 0
|
||||||
|
|
||||||
for position in positions {
|
for position in positions {
|
||||||
key_buffer.truncate(word.len());
|
key_buffer.truncate(word.len() + 1);
|
||||||
let position_bytes = position.to_be_bytes();
|
let position_bytes = position.to_be_bytes();
|
||||||
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
||||||
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
||||||
|
@ -3,7 +3,7 @@ use std::fs::File;
|
|||||||
use std::num::NonZeroUsize;
|
use std::num::NonZeroUsize;
|
||||||
|
|
||||||
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||||
use heed::types::DecodeIgnore;
|
use heed::types::{DecodeIgnore, Str};
|
||||||
use heed::{BytesEncode, Error};
|
use heed::{BytesEncode, Error};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -56,10 +56,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
pub fn execute(self) -> anyhow::Result<()> {
|
pub fn execute(self) -> anyhow::Result<()> {
|
||||||
debug!("Computing and writing the word levels positions docids into LMDB on disk...");
|
debug!("Computing and writing the word levels positions docids into LMDB on disk...");
|
||||||
|
|
||||||
clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?;
|
|
||||||
|
|
||||||
let entries = compute_positions_levels(
|
let entries = compute_positions_levels(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
|
self.index.word_docids.remap_data_type::<DecodeIgnore>(),
|
||||||
self.index.word_level_position_docids,
|
self.index.word_level_position_docids,
|
||||||
self.chunk_compression_type,
|
self.chunk_compression_type,
|
||||||
self.chunk_compression_level,
|
self.chunk_compression_level,
|
||||||
@ -74,7 +73,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
|
|
||||||
write_into_lmdb_database(
|
write_into_lmdb_database(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
*self.index.facet_field_id_value_docids.as_polymorph(),
|
*self.index.word_level_position_docids.as_polymorph(),
|
||||||
entries,
|
entries,
|
||||||
|_, _| anyhow::bail!("invalid facet level merging"),
|
|_, _| anyhow::bail!("invalid facet level merging"),
|
||||||
WriteMethod::Append,
|
WriteMethod::Append,
|
||||||
@ -84,25 +83,11 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn clear_non_zero_levels_positions(
|
/// Generates all the word position levels from the level-zero entries (level zero itself included).
|
||||||
wtxn: &mut heed::RwTxn,
|
|
||||||
db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
|
||||||
) -> heed::Result<()>
|
|
||||||
{
|
|
||||||
let mut iter = db.iter_mut(wtxn)?.lazily_decode_data();
|
|
||||||
while let Some(result) = iter.next() {
|
|
||||||
let ((_, level, _, _), _) = result?;
|
|
||||||
if level != 0 {
|
|
||||||
iter.del_current()?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Generates all the words positions levels (including the level zero).
|
|
||||||
fn compute_positions_levels(
|
fn compute_positions_levels(
|
||||||
rtxn: &heed::RoTxn,
|
rtxn: &heed::RoTxn,
|
||||||
db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
words_db: heed::Database<Str, DecodeIgnore>,
|
||||||
|
words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||||
compression_type: CompressionType,
|
compression_type: CompressionType,
|
||||||
compression_level: Option<u32>,
|
compression_level: Option<u32>,
|
||||||
shrink_size: Option<u64>,
|
shrink_size: Option<u64>,
|
||||||
@ -116,11 +101,11 @@ fn compute_positions_levels(
|
|||||||
create_writer(compression_type, compression_level, file)
|
create_writer(compression_type, compression_level, file)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
for result in db.iter(rtxn)? {
|
for result in words_db.iter(rtxn)? {
|
||||||
let ((word, level, left, right), docids) = result?;
|
let (word, ()) = result?;
|
||||||
|
|
||||||
let first_level_size = db.remap_data_type::<DecodeIgnore>()
|
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
||||||
.prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))?
|
.prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))?
|
||||||
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
||||||
|
|
||||||
let level_0_range = {
|
let level_0_range = {
|
||||||
@ -136,14 +121,17 @@ fn compute_positions_levels(
|
|||||||
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
|
.take_while(|(_, s)| first_level_size / *s >= min_level_size.get());
|
||||||
|
|
||||||
// As specified in the documentation, we also write the level 0 entries.
|
// As specified in the documentation, we also write the level 0 entries.
|
||||||
write_level_entry(&mut writer, word, level, left, right, &docids)?;
|
for result in words_positions_db.range(rtxn, &level_0_range)? {
|
||||||
|
let ((word, level, left, right), docids) = result?;
|
||||||
|
write_level_entry(&mut writer, word, level, left, right, &docids)?;
|
||||||
|
}
|
||||||
|
|
||||||
for (level, group_size) in group_size_iter {
|
for (level, group_size) in group_size_iter {
|
||||||
let mut left = 0;
|
let mut left = 0;
|
||||||
let mut right = 0;
|
let mut right = 0;
|
||||||
let mut group_docids = RoaringBitmap::new();
|
let mut group_docids = RoaringBitmap::new();
|
||||||
|
|
||||||
for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() {
|
for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
|
||||||
let ((_field_id, _level, value, _right), docids) = result?;
|
let ((_field_id, _level, value, _right), docids) = result?;
|
||||||
|
|
||||||
if i == 0 {
|
if i == 0 {
|
||||||
|
Loading…
Reference in New Issue
Block a user