mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-10-30 01:38:49 +01:00
Finish prefix databases
This commit is contained in:
parent
a2fbf2ea21
commit
d675e73af1
@ -1669,6 +1669,14 @@ impl Index {
|
|||||||
}
|
}
|
||||||
Ok(res)
|
Ok(res)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result<PrefixSettings> {
|
||||||
|
Ok(PrefixSettings {
|
||||||
|
compute_prefixes: true,
|
||||||
|
max_prefix_length: 4,
|
||||||
|
prefix_count_threshold: 100,
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
@ -1678,6 +1686,13 @@ pub struct IndexEmbeddingConfig {
|
|||||||
pub user_provided: RoaringBitmap,
|
pub user_provided: RoaringBitmap,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
pub struct PrefixSettings {
|
||||||
|
pub prefix_count_threshold: u64,
|
||||||
|
pub max_prefix_length: usize,
|
||||||
|
pub compute_prefixes: bool,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
#[serde(transparent)]
|
#[serde(transparent)]
|
||||||
struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime);
|
struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime);
|
||||||
|
@ -29,6 +29,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
|||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::update::new::channel::ExtractorSender;
|
use crate::update::new::channel::ExtractorSender;
|
||||||
|
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
||||||
use crate::update::settings::InnerIndexSettings;
|
use crate::update::settings::InnerIndexSettings;
|
||||||
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
||||||
use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
||||||
@ -301,6 +302,8 @@ fn compute_prefix_database(
|
|||||||
let PrefixDelta { modified, deleted } = prefix_delta;
|
let PrefixDelta { modified, deleted } = prefix_delta;
|
||||||
// Compute word prefix docids
|
// Compute word prefix docids
|
||||||
compute_word_prefix_docids(wtxn, index, &modified, &deleted)?;
|
compute_word_prefix_docids(wtxn, index, &modified, &deleted)?;
|
||||||
|
// Compute exact word prefix docids
|
||||||
|
compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted)?;
|
||||||
// Compute word prefix fid docids
|
// Compute word prefix fid docids
|
||||||
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?;
|
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted)?;
|
||||||
// Compute word prefix position docids
|
// Compute word prefix position docids
|
||||||
|
@ -10,7 +10,7 @@ use roaring::RoaringBitmap;
|
|||||||
|
|
||||||
use super::channel::*;
|
use super::channel::*;
|
||||||
use super::extract::FacetKind;
|
use super::extract::FacetKind;
|
||||||
use super::word_fst_builder::{PrefixData, PrefixDelta, PrefixSettings};
|
use super::word_fst_builder::{PrefixData, PrefixDelta};
|
||||||
use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId};
|
use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId};
|
||||||
use crate::update::del_add::DelAdd;
|
use crate::update::del_add::DelAdd;
|
||||||
use crate::update::new::channel::MergerOperation;
|
use crate::update::new::channel::MergerOperation;
|
||||||
@ -63,12 +63,7 @@ pub fn merge_grenad_entries(
|
|||||||
MergerOperation::WordDocidsMerger(merger) => {
|
MergerOperation::WordDocidsMerger(merger) => {
|
||||||
let words_fst = index.words_fst(rtxn)?;
|
let words_fst = index.words_fst(rtxn)?;
|
||||||
let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
|
let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
|
||||||
/// TODO make this configurable
|
let prefix_settings = index.prefix_settings(rtxn)?;
|
||||||
let prefix_settings = PrefixSettings {
|
|
||||||
compute_prefixes: true,
|
|
||||||
max_prefix_length: 4,
|
|
||||||
prefix_count_threshold: 100,
|
|
||||||
};
|
|
||||||
word_fst_builder.with_prefix_settings(prefix_settings);
|
word_fst_builder.with_prefix_settings(prefix_settings);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -5,7 +5,7 @@ use memmap2::Mmap;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use tempfile::tempfile;
|
use tempfile::tempfile;
|
||||||
|
|
||||||
use crate::{update::del_add::DelAdd, Prefix, Result};
|
use crate::{index::PrefixSettings, update::del_add::DelAdd, InternalError, Prefix, Result};
|
||||||
|
|
||||||
pub struct WordFstBuilder<'a> {
|
pub struct WordFstBuilder<'a> {
|
||||||
stream: Option<fst::set::Stream<'a>>,
|
stream: Option<fst::set::Stream<'a>>,
|
||||||
@ -143,8 +143,10 @@ impl<'a> WordFstBuilder<'a> {
|
|||||||
) -> Result<(Mmap, Option<PrefixData>)> {
|
) -> Result<(Mmap, Option<PrefixData>)> {
|
||||||
self.drain_stream()?;
|
self.drain_stream()?;
|
||||||
|
|
||||||
/// TODO: ugly unwrap
|
let words_fst_file =
|
||||||
let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap();
|
self.word_fst_builder.into_inner()?.into_inner().map_err(|_| {
|
||||||
|
InternalError::IndexingMergingKeys { process: "building-words-fst" }
|
||||||
|
})?;
|
||||||
let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? };
|
let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? };
|
||||||
|
|
||||||
let prefix_data = self
|
let prefix_data = self
|
||||||
@ -156,13 +158,6 @@ impl<'a> WordFstBuilder<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct PrefixSettings {
|
|
||||||
pub prefix_count_threshold: u64,
|
|
||||||
pub max_prefix_length: usize,
|
|
||||||
pub compute_prefixes: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct PrefixData {
|
pub struct PrefixData {
|
||||||
pub prefixes_fst_mmap: Mmap,
|
pub prefixes_fst_mmap: Mmap,
|
||||||
pub prefix_delta: PrefixDelta,
|
pub prefix_delta: PrefixDelta,
|
||||||
@ -269,8 +264,9 @@ impl PrefixFstBuilder {
|
|||||||
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
||||||
let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?;
|
let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?;
|
||||||
builder.extend_stream(op.r#union())?;
|
builder.extend_stream(op.r#union())?;
|
||||||
/// TODO: ugly unwrap
|
let prefix_fst_file = builder.into_inner()?.into_inner().map_err(|_| {
|
||||||
let prefix_fst_file = builder.into_inner()?.into_inner().unwrap();
|
InternalError::IndexingMergingKeys { process: "building-words-prefixes-fst" }
|
||||||
|
})?;
|
||||||
let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? };
|
let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? };
|
||||||
let new_prefix_fst = Set::new(&prefix_fst_mmap)?;
|
let new_prefix_fst = Set::new(&prefix_fst_mmap)?;
|
||||||
let old_prefix_fst = index.words_prefixes_fst(rtxn)?;
|
let old_prefix_fst = index.words_prefixes_fst(rtxn)?;
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use heed::Database;
|
use hashbrown::HashMap;
|
||||||
use heed::{types::Bytes, RwTxn};
|
use heed::{types::Bytes, RwTxn};
|
||||||
|
use heed::{BytesDecode, Database};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::heed_codec::StrBEU16Codec;
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Prefix, Result};
|
use crate::{CboRoaringBitmapCodec, Index, Prefix, Result};
|
||||||
|
|
||||||
struct WordPrefixDocids {
|
struct WordPrefixDocids {
|
||||||
@ -25,23 +27,10 @@ impl WordPrefixDocids {
|
|||||||
prefix_to_compute: &HashSet<Prefix>,
|
prefix_to_compute: &HashSet<Prefix>,
|
||||||
prefix_to_delete: &HashSet<Prefix>,
|
prefix_to_delete: &HashSet<Prefix>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.delete_prefixes(wtxn, prefix_to_delete)?;
|
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
|
||||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
|
||||||
fn delete_prefixes(&self, wtxn: &mut heed::RwTxn, prefixes: &HashSet<Prefix>) -> Result<()> {
|
|
||||||
// We remove all the entries that are no more required in this word prefix docids database.
|
|
||||||
for prefix in prefixes {
|
|
||||||
let prefix = prefix.as_bytes();
|
|
||||||
if !self.prefix_database.delete(wtxn, prefix)? {
|
|
||||||
unreachable!("We tried to delete an unknown key")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||||
fn recompute_modified_prefixes(
|
fn recompute_modified_prefixes(
|
||||||
&self,
|
&self,
|
||||||
@ -65,6 +54,89 @@ impl WordPrefixDocids {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct WordPrefixIntegerDocids {
|
||||||
|
database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||||
|
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WordPrefixIntegerDocids {
|
||||||
|
fn new(
|
||||||
|
database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||||
|
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||||
|
) -> WordPrefixIntegerDocids {
|
||||||
|
WordPrefixIntegerDocids { database, prefix_database }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute(
|
||||||
|
self,
|
||||||
|
wtxn: &mut heed::RwTxn,
|
||||||
|
prefix_to_compute: &HashSet<Prefix>,
|
||||||
|
prefix_to_delete: &HashSet<Prefix>,
|
||||||
|
) -> Result<()> {
|
||||||
|
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
|
||||||
|
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||||
|
fn recompute_modified_prefixes(
|
||||||
|
&self,
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
prefixes: &HashSet<Prefix>,
|
||||||
|
) -> Result<()> {
|
||||||
|
// We fetch the docids associated to the newly added word prefix fst only.
|
||||||
|
// We use a HashMap to store the docids associated to each position, may be RAM consuming.
|
||||||
|
let mut integer_docids = HashMap::new();
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
for prefix in prefixes {
|
||||||
|
let prefix = prefix.as_bytes();
|
||||||
|
for result in self.database.prefix_iter(wtxn, prefix)? {
|
||||||
|
let (key, data) = result?;
|
||||||
|
let (_word, pos) =
|
||||||
|
StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||||
|
|
||||||
|
match integer_docids.get_mut(&pos) {
|
||||||
|
Some(docids) => {
|
||||||
|
*docids |= &data;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
integer_docids.insert(pos, data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (pos, docids) in integer_docids.iter_mut() {
|
||||||
|
if !docids.is_empty() {
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(prefix);
|
||||||
|
key_buffer.push(0);
|
||||||
|
key_buffer.extend_from_slice(&pos.to_be_bytes());
|
||||||
|
self.prefix_database.put(wtxn, &key_buffer, &docids)?;
|
||||||
|
}
|
||||||
|
docids.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||||
|
fn delete_prefixes(
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
prefix_database: &Database<Bytes, CboRoaringBitmapCodec>,
|
||||||
|
prefixes: &HashSet<Prefix>,
|
||||||
|
) -> Result<()> {
|
||||||
|
// We remove all the entries that are no more required in this word prefix docids database.
|
||||||
|
for prefix in prefixes {
|
||||||
|
let prefix = prefix.as_bytes();
|
||||||
|
if !prefix_database.delete(wtxn, prefix)? {
|
||||||
|
unreachable!("We tried to delete an unknown key")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||||
pub fn compute_word_prefix_docids(
|
pub fn compute_word_prefix_docids(
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
@ -80,13 +152,27 @@ pub fn compute_word_prefix_docids(
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||||
pub fn compute_word_prefix_fid_docids(
|
pub fn compute_exact_word_prefix_docids(
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
prefix_to_compute: &HashSet<Prefix>,
|
prefix_to_compute: &HashSet<Prefix>,
|
||||||
prefix_to_delete: &HashSet<Prefix>,
|
prefix_to_delete: &HashSet<Prefix>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
WordPrefixDocids::new(
|
WordPrefixDocids::new(
|
||||||
|
index.exact_word_docids.remap_key_type(),
|
||||||
|
index.exact_word_prefix_docids.remap_key_type(),
|
||||||
|
)
|
||||||
|
.execute(wtxn, prefix_to_compute, prefix_to_delete)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||||
|
pub fn compute_word_prefix_fid_docids(
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
index: &Index,
|
||||||
|
prefix_to_compute: &HashSet<Prefix>,
|
||||||
|
prefix_to_delete: &HashSet<Prefix>,
|
||||||
|
) -> Result<()> {
|
||||||
|
WordPrefixIntegerDocids::new(
|
||||||
index.word_fid_docids.remap_key_type(),
|
index.word_fid_docids.remap_key_type(),
|
||||||
index.word_prefix_fid_docids.remap_key_type(),
|
index.word_prefix_fid_docids.remap_key_type(),
|
||||||
)
|
)
|
||||||
@ -100,7 +186,7 @@ pub fn compute_word_prefix_position_docids(
|
|||||||
prefix_to_compute: &HashSet<Prefix>,
|
prefix_to_compute: &HashSet<Prefix>,
|
||||||
prefix_to_delete: &HashSet<Prefix>,
|
prefix_to_delete: &HashSet<Prefix>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
WordPrefixDocids::new(
|
WordPrefixIntegerDocids::new(
|
||||||
index.word_position_docids.remap_key_type(),
|
index.word_position_docids.remap_key_type(),
|
||||||
index.word_prefix_position_docids.remap_key_type(),
|
index.word_prefix_position_docids.remap_key_type(),
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user