mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Introduce the field-id-docid-facet-values database
This commit is contained in:
parent
4ffbddf21f
commit
3cdf14d4c5
36
src/heed_codec/facet/field_doc_id_facet_f64_codec.rs
Normal file
36
src/heed_codec/facet/field_doc_id_facet_f64_codec.rs
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
use crate::{FieldId, DocumentId};
|
||||||
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
|
|
||||||
|
pub struct FieldDocIdFacetF64Codec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
|
||||||
|
type DItem = (FieldId, DocumentId, f64);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id, bytes) = bytes.split_first()?;
|
||||||
|
|
||||||
|
let (document_id_bytes, bytes) = bytes.split_at(4);
|
||||||
|
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||||
|
|
||||||
|
let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?;
|
||||||
|
|
||||||
|
Some((*field_id, document_id, value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
|
||||||
|
type EItem = (FieldId, DocumentId, f64);
|
||||||
|
|
||||||
|
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8);
|
||||||
|
bytes.push(*field_id);
|
||||||
|
bytes.extend_from_slice(&document_id.to_be_bytes());
|
||||||
|
let value_bytes = f64_into_bytes(*value)?;
|
||||||
|
bytes.extend_from_slice(&value_bytes);
|
||||||
|
bytes.extend_from_slice(&value.to_be_bytes());
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
34
src/heed_codec/facet/field_doc_id_facet_i64_codec.rs
Normal file
34
src/heed_codec/facet/field_doc_id_facet_i64_codec.rs
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
use crate::facet::value_encoding::{i64_into_bytes, i64_from_bytes};
|
||||||
|
use crate::{FieldId, DocumentId};
|
||||||
|
|
||||||
|
pub struct FieldDocIdFacetI64Codec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetI64Codec {
|
||||||
|
type DItem = (FieldId, DocumentId, i64);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id, bytes) = bytes.split_first()?;
|
||||||
|
|
||||||
|
let (document_id_bytes, bytes) = bytes.split_at(4);
|
||||||
|
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||||
|
|
||||||
|
let value = bytes[..8].try_into().map(i64_from_bytes).ok()?;
|
||||||
|
|
||||||
|
Some((*field_id, document_id, value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetI64Codec {
|
||||||
|
type EItem = (FieldId, DocumentId, i64);
|
||||||
|
|
||||||
|
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let mut bytes = Vec::with_capacity(1 + 4 + 8);
|
||||||
|
bytes.push(*field_id);
|
||||||
|
bytes.extend_from_slice(&document_id.to_be_bytes());
|
||||||
|
bytes.extend_from_slice(&i64_into_bytes(*value));
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
31
src/heed_codec/facet/field_doc_id_facet_string_codec.rs
Normal file
31
src/heed_codec/facet/field_doc_id_facet_string_codec.rs
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
use crate::{FieldId, DocumentId};
|
||||||
|
|
||||||
|
pub struct FieldDocIdFacetStringCodec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
|
||||||
|
type DItem = (FieldId, DocumentId, &'a str);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id, bytes) = bytes.split_first()?;
|
||||||
|
let (document_id_bytes, bytes) = bytes.split_at(4);
|
||||||
|
let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||||
|
let value = str::from_utf8(bytes).ok()?;
|
||||||
|
Some((*field_id, document_id, value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec {
|
||||||
|
type EItem = (FieldId, DocumentId, &'a str);
|
||||||
|
|
||||||
|
fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let mut bytes = Vec::with_capacity(1 + 4 + value.len());
|
||||||
|
bytes.push(*field_id);
|
||||||
|
bytes.extend_from_slice(&document_id.to_be_bytes());
|
||||||
|
bytes.extend_from_slice(value.as_bytes());
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
@ -1,7 +1,13 @@
|
|||||||
mod facet_level_value_f64_codec;
|
mod facet_level_value_f64_codec;
|
||||||
mod facet_level_value_i64_codec;
|
mod facet_level_value_i64_codec;
|
||||||
mod facet_value_string_codec;
|
mod facet_value_string_codec;
|
||||||
|
mod field_doc_id_facet_f64_codec;
|
||||||
|
mod field_doc_id_facet_i64_codec;
|
||||||
|
mod field_doc_id_facet_string_codec;
|
||||||
|
|
||||||
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
||||||
pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec;
|
pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec;
|
||||||
pub use self::facet_value_string_codec::FacetValueStringCodec;
|
pub use self::facet_value_string_codec::FacetValueStringCodec;
|
||||||
|
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
|
||||||
|
pub use self::field_doc_id_facet_i64_codec::FieldDocIdFacetI64Codec;
|
||||||
|
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
|
||||||
|
@ -42,13 +42,15 @@ pub struct Index {
|
|||||||
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
|
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
|
||||||
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
|
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
|
/// Maps the document id, the facet field id and the globally ordered value.
|
||||||
|
pub field_id_docid_facet_values: Database<ByteSlice, Unit>,
|
||||||
/// Maps the document id to the document as an obkv store.
|
/// Maps the document id to the document as an obkv store.
|
||||||
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Index {
|
impl Index {
|
||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
||||||
options.max_dbs(6);
|
options.max_dbs(7);
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
let main = env.create_poly_database(Some("main"))?;
|
let main = env.create_poly_database(Some("main"))?;
|
||||||
@ -56,6 +58,7 @@ impl Index {
|
|||||||
let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
|
let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
|
||||||
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
||||||
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
|
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
|
||||||
|
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
|
||||||
let documents = env.create_database(Some("documents"))?;
|
let documents = env.create_database(Some("documents"))?;
|
||||||
|
|
||||||
Ok(Index {
|
Ok(Index {
|
||||||
@ -65,6 +68,7 @@ impl Index {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -297,6 +297,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
|
field_id_docid_facet_values: _,
|
||||||
documents,
|
documents,
|
||||||
} = index;
|
} = index;
|
||||||
|
|
||||||
|
@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
|
|
||||||
@ -41,6 +42,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
docid_word_positions.clear(self.wtxn)?;
|
docid_word_positions.clear(self.wtxn)?;
|
||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
facet_field_id_value_docids.clear(self.wtxn)?;
|
facet_field_id_value_docids.clear(self.wtxn)?;
|
||||||
|
field_id_docid_facet_values.clear(self.wtxn)?;
|
||||||
documents.clear(self.wtxn)?;
|
documents.clear(self.wtxn)?;
|
||||||
|
|
||||||
Ok(number_of_documents)
|
Ok(number_of_documents)
|
||||||
|
@ -2,7 +2,9 @@ use fst::IntoStreamer;
|
|||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::facet::FacetType;
|
||||||
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
|
use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds};
|
||||||
|
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
|
||||||
use super::ClearDocuments;
|
use super::ClearDocuments;
|
||||||
|
|
||||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||||
@ -75,6 +77,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
|
|
||||||
@ -186,10 +189,42 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
// Remove the documents ids from the faceted documents ids.
|
// Remove the documents ids from the faceted documents ids.
|
||||||
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
let faceted_fields = self.index.faceted_fields(self.wtxn)?;
|
||||||
for (field_id, _) in faceted_fields {
|
for (field_id, facet_type) in faceted_fields {
|
||||||
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
|
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
|
||||||
docids.difference_with(&self.documents_ids);
|
docids.difference_with(&self.documents_ids);
|
||||||
self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
|
self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?;
|
||||||
|
|
||||||
|
// We delete the entries that are part of the documents ids.
|
||||||
|
let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?;
|
||||||
|
match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
let mut iter = iter.remap_key_type::<FieldDocIdFacetStringCodec>();
|
||||||
|
while let Some(result) = iter.next() {
|
||||||
|
let ((_fid, docid, _value), ()) = result?;
|
||||||
|
if self.documents_ids.contains(docid) {
|
||||||
|
iter.del_current()?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
FacetType::Float => {
|
||||||
|
let mut iter = iter.remap_key_type::<FieldDocIdFacetF64Codec>();
|
||||||
|
while let Some(result) = iter.next() {
|
||||||
|
let ((_fid, docid, _value), ()) = result?;
|
||||||
|
if self.documents_ids.contains(docid) {
|
||||||
|
iter.del_current()?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
FacetType::Integer => {
|
||||||
|
let mut iter = iter.remap_key_type::<FieldDocIdFacetI64Codec>();
|
||||||
|
while let Some(result) = iter.next() {
|
||||||
|
let ((_fid, docid, _value), ()) = result?;
|
||||||
|
if self.documents_ids.contains(docid) {
|
||||||
|
iter.del_current()?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// We delete the documents ids that are under the facet field id values.
|
// We delete the documents ids that are under the facet field id values.
|
||||||
@ -205,6 +240,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
drop(iter);
|
||||||
|
|
||||||
Ok(self.documents_ids.len() as usize)
|
Ok(self.documents_ids.len() as usize)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
use anyhow::{bail, ensure};
|
use anyhow::{bail, ensure, Context};
|
||||||
use bstr::ByteSlice as _;
|
use bstr::ByteSlice as _;
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -42,6 +42,12 @@ pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::
|
|||||||
bail!("merging docid word positions is an error ({:?})", key.as_bstr())
|
bail!("merging docid word positions is an error ({:?})", key.as_bstr())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn field_id_docid_facet_values_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let first = values.first().context("no value to merge")?;
|
||||||
|
ensure!(values.iter().all(|v| v == first), "invalid field id docid facet value merging");
|
||||||
|
Ok(first.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
cbo_roaring_bitmap_merge(values)
|
cbo_roaring_bitmap_merge(values)
|
||||||
}
|
}
|
||||||
|
@ -21,6 +21,7 @@ use self::store::{Store, Readers};
|
|||||||
use self::merge_function::{
|
use self::merge_function::{
|
||||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||||
docid_word_positions_merge, documents_merge, facet_field_value_docids_merge,
|
docid_word_positions_merge, documents_merge, facet_field_value_docids_merge,
|
||||||
|
field_id_docid_facet_values_merge,
|
||||||
};
|
};
|
||||||
pub use self::transform::{Transform, TransformOutput};
|
pub use self::transform::{Transform, TransformOutput};
|
||||||
|
|
||||||
@ -395,6 +396,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
|
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
|
||||||
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
|
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
|
||||||
let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len());
|
let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len());
|
||||||
|
let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len());
|
||||||
let mut documents_readers = Vec::with_capacity(readers.len());
|
let mut documents_readers = Vec::with_capacity(readers.len());
|
||||||
readers.into_iter().for_each(|readers| {
|
readers.into_iter().for_each(|readers| {
|
||||||
let Readers {
|
let Readers {
|
||||||
@ -403,6 +405,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
words_pairs_proximities_docids,
|
words_pairs_proximities_docids,
|
||||||
facet_field_value_docids,
|
facet_field_value_docids,
|
||||||
|
field_id_docid_facet_values,
|
||||||
documents
|
documents
|
||||||
} = readers;
|
} = readers;
|
||||||
main_readers.push(main);
|
main_readers.push(main);
|
||||||
@ -410,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
docid_word_positions_readers.push(docid_word_positions);
|
docid_word_positions_readers.push(docid_word_positions);
|
||||||
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
|
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
|
||||||
facet_field_value_docids_readers.push(facet_field_value_docids);
|
facet_field_value_docids_readers.push(facet_field_value_docids);
|
||||||
|
field_id_docid_facet_values_readers.push(field_id_docid_facet_values);
|
||||||
documents_readers.push(documents);
|
documents_readers.push(documents);
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -453,6 +457,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
docid_word_positions_readers,
|
docid_word_positions_readers,
|
||||||
documents_readers,
|
documents_readers,
|
||||||
words_pairs_proximities_docids_readers,
|
words_pairs_proximities_docids_readers,
|
||||||
|
field_id_docid_facet_values_readers,
|
||||||
)) as anyhow::Result<_>
|
)) as anyhow::Result<_>
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
@ -461,6 +466,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
docid_word_positions_readers,
|
docid_word_positions_readers,
|
||||||
documents_readers,
|
documents_readers,
|
||||||
words_pairs_proximities_docids_readers,
|
words_pairs_proximities_docids_readers,
|
||||||
|
field_id_docid_facet_values_readers,
|
||||||
) = readers;
|
) = readers;
|
||||||
|
|
||||||
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
|
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||||
@ -488,7 +494,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
||||||
|
|
||||||
let mut database_count = 0;
|
let mut database_count = 0;
|
||||||
let total_databases = 6;
|
let total_databases = 7;
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
databases_seen: 0,
|
databases_seen: 0,
|
||||||
@ -525,6 +531,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
total_databases,
|
total_databases,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
debug!("Writing the field id docid facet values into LMDB on disk...");
|
||||||
|
merge_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
*self.index.field_id_docid_facet_values.as_polymorph(),
|
||||||
|
field_id_docid_facet_values_readers,
|
||||||
|
field_id_docid_facet_values_merge,
|
||||||
|
write_method,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
database_count += 1;
|
||||||
|
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
|
databases_seen: database_count,
|
||||||
|
total_databases,
|
||||||
|
});
|
||||||
|
|
||||||
debug!("Writing the words pairs proximities docids into LMDB on disk...");
|
debug!("Writing the words pairs proximities docids into LMDB on disk...");
|
||||||
merge_into_lmdb_database(
|
merge_into_lmdb_database(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
|
@ -20,6 +20,7 @@ use tempfile::tempfile;
|
|||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
|
||||||
|
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
|
||||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||||
use crate::update::UpdateIndexingStep;
|
use crate::update::UpdateIndexingStep;
|
||||||
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
|
use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
|
||||||
@ -27,7 +28,7 @@ use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, Docu
|
|||||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||||
use super::merge_function::{
|
use super::merge_function::{
|
||||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||||
facet_field_value_docids_merge,
|
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
|
||||||
};
|
};
|
||||||
|
|
||||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||||
@ -42,6 +43,7 @@ pub struct Readers {
|
|||||||
pub docid_word_positions: Reader<FileFuse>,
|
pub docid_word_positions: Reader<FileFuse>,
|
||||||
pub words_pairs_proximities_docids: Reader<FileFuse>,
|
pub words_pairs_proximities_docids: Reader<FileFuse>,
|
||||||
pub facet_field_value_docids: Reader<FileFuse>,
|
pub facet_field_value_docids: Reader<FileFuse>,
|
||||||
|
pub field_id_docid_facet_values: Reader<FileFuse>,
|
||||||
pub documents: Reader<FileFuse>,
|
pub documents: Reader<FileFuse>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,6 +67,7 @@ pub struct Store {
|
|||||||
word_docids_sorter: Sorter<MergeFn>,
|
word_docids_sorter: Sorter<MergeFn>,
|
||||||
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
words_pairs_proximities_docids_sorter: Sorter<MergeFn>,
|
||||||
facet_field_value_docids_sorter: Sorter<MergeFn>,
|
facet_field_value_docids_sorter: Sorter<MergeFn>,
|
||||||
|
field_id_docid_facet_values_sorter: Sorter<MergeFn>,
|
||||||
// MTBL writers
|
// MTBL writers
|
||||||
docid_word_positions_writer: Writer<File>,
|
docid_word_positions_writer: Writer<File>,
|
||||||
documents_writer: Writer<File>,
|
documents_writer: Writer<File>,
|
||||||
@ -118,6 +121,14 @@ impl Store {
|
|||||||
max_nb_chunks,
|
max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
let field_id_docid_facet_values_sorter = create_sorter(
|
||||||
|
field_id_docid_facet_values_merge,
|
||||||
|
chunk_compression_type,
|
||||||
|
chunk_compression_level,
|
||||||
|
chunk_fusing_shrink_size,
|
||||||
|
max_nb_chunks,
|
||||||
|
Some(1024 * 1024 * 1024), // 1MB
|
||||||
|
);
|
||||||
|
|
||||||
let documents_writer = tempfile().and_then(|f| {
|
let documents_writer = tempfile().and_then(|f| {
|
||||||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||||
@ -146,6 +157,7 @@ impl Store {
|
|||||||
word_docids_sorter,
|
word_docids_sorter,
|
||||||
words_pairs_proximities_docids_sorter,
|
words_pairs_proximities_docids_sorter,
|
||||||
facet_field_value_docids_sorter,
|
facet_field_value_docids_sorter,
|
||||||
|
field_id_docid_facet_values_sorter,
|
||||||
// MTBL writers
|
// MTBL writers
|
||||||
docid_word_positions_writer,
|
docid_word_positions_writer,
|
||||||
documents_writer,
|
documents_writer,
|
||||||
@ -181,6 +193,8 @@ impl Store {
|
|||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
) -> anyhow::Result<()>
|
) -> anyhow::Result<()>
|
||||||
{
|
{
|
||||||
|
Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?;
|
||||||
|
|
||||||
let key = (field_id, field_value);
|
let key = (field_id, field_value);
|
||||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||||
match self.facet_field_value_docids.get_refresh(&key) {
|
match self.facet_field_value_docids.get_refresh(&key) {
|
||||||
@ -192,7 +206,7 @@ impl Store {
|
|||||||
// one element, this way next time we insert we doesn't grow the capacity.
|
// one element, this way next time we insert we doesn't grow the capacity.
|
||||||
if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit {
|
if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit {
|
||||||
// Removing the front element is equivalent to removing the LRU element.
|
// Removing the front element is equivalent to removing the LRU element.
|
||||||
Self::write_docid_facet_field_values(
|
Self::write_facet_field_value_docids(
|
||||||
&mut self.facet_field_value_docids_sorter,
|
&mut self.facet_field_value_docids_sorter,
|
||||||
self.facet_field_value_docids.pop_front(),
|
self.facet_field_value_docids.pop_front(),
|
||||||
)?;
|
)?;
|
||||||
@ -326,11 +340,11 @@ impl Store {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_docid_facet_field_values<I>(
|
fn write_facet_field_value_docids<I>(
|
||||||
sorter: &mut Sorter<MergeFn>,
|
sorter: &mut Sorter<MergeFn>,
|
||||||
iter: I,
|
iter: I,
|
||||||
) -> anyhow::Result<()>
|
) -> anyhow::Result<()>
|
||||||
where I: IntoIterator<Item=((u8, FacetValue), RoaringBitmap)>
|
where I: IntoIterator<Item=((FieldId, FacetValue), RoaringBitmap)>
|
||||||
{
|
{
|
||||||
use FacetValue::*;
|
use FacetValue::*;
|
||||||
|
|
||||||
@ -351,6 +365,29 @@ impl Store {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn write_field_id_docid_facet_value(
|
||||||
|
sorter: &mut Sorter<MergeFn>,
|
||||||
|
field_id: FieldId,
|
||||||
|
document_id: DocumentId,
|
||||||
|
value: &FacetValue,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
{
|
||||||
|
use FacetValue::*;
|
||||||
|
|
||||||
|
let result = match value {
|
||||||
|
String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned),
|
||||||
|
Float(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned),
|
||||||
|
Integer(i) => FieldDocIdFacetI64Codec::bytes_encode(&(field_id, document_id, *i)).map(Cow::into_owned),
|
||||||
|
};
|
||||||
|
|
||||||
|
let key = result.context("could not serialize facet key")?;
|
||||||
|
if lmdb_key_valid_size(&key) {
|
||||||
|
sorter.insert(&key, &[])?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn write_word_docids<I>(sorter: &mut Sorter<MergeFn>, iter: I) -> anyhow::Result<()>
|
fn write_word_docids<I>(sorter: &mut Sorter<MergeFn>, iter: I) -> anyhow::Result<()>
|
||||||
where I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>
|
where I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>
|
||||||
{
|
{
|
||||||
@ -463,7 +500,7 @@ impl Store {
|
|||||||
&mut self.words_pairs_proximities_docids_sorter,
|
&mut self.words_pairs_proximities_docids_sorter,
|
||||||
self.words_pairs_proximities_docids,
|
self.words_pairs_proximities_docids,
|
||||||
)?;
|
)?;
|
||||||
Self::write_docid_facet_field_values(
|
Self::write_facet_field_value_docids(
|
||||||
&mut self.facet_field_value_docids_sorter,
|
&mut self.facet_field_value_docids_sorter,
|
||||||
self.facet_field_value_docids,
|
self.facet_field_value_docids,
|
||||||
)?;
|
)?;
|
||||||
@ -491,10 +528,14 @@ impl Store {
|
|||||||
let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?;
|
self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?;
|
||||||
|
|
||||||
|
let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||||
|
self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?;
|
||||||
|
|
||||||
let main = writer_into_reader(main_wtr, shrink_size)?;
|
let main = writer_into_reader(main_wtr, shrink_size)?;
|
||||||
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
||||||
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||||
let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?;
|
let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?;
|
||||||
|
let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?;
|
||||||
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||||
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
|
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
|
||||||
|
|
||||||
@ -504,6 +545,7 @@ impl Store {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
words_pairs_proximities_docids,
|
words_pairs_proximities_docids,
|
||||||
facet_field_value_docids,
|
facet_field_value_docids,
|
||||||
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user