mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Add a database containing the docids where each field exists
This commit is contained in:
parent
5704235521
commit
453d593ce8
@ -384,6 +384,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_exists_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
field_id_docid_facet_f64s: _,
|
field_id_docid_facet_f64s: _,
|
||||||
@ -402,6 +403,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
let field_id_word_count_docids_name = "field_id_word_count_docids";
|
let field_id_word_count_docids_name = "field_id_word_count_docids";
|
||||||
let facet_id_f64_docids_name = "facet_id_f64_docids";
|
let facet_id_f64_docids_name = "facet_id_f64_docids";
|
||||||
let facet_id_string_docids_name = "facet_id_string_docids";
|
let facet_id_string_docids_name = "facet_id_string_docids";
|
||||||
|
let facet_id_exists_docids_name = "facet_id_exists_docids";
|
||||||
let documents_name = "documents";
|
let documents_name = "documents";
|
||||||
|
|
||||||
let mut heap = BinaryHeap::with_capacity(limit + 1);
|
let mut heap = BinaryHeap::with_capacity(limit + 1);
|
||||||
@ -544,6 +546,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
heap.pop();
|
heap.pop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// List the docids where the facet exists
|
||||||
|
let db = facet_id_exists_docids.remap_data_type::<ByteSlice>();
|
||||||
|
for result in facet_values_iter(rtxn, db, facet_id)? {
|
||||||
|
let (_fid, value) = result?;
|
||||||
|
let key = format!("{}", facet_name);
|
||||||
|
heap.push(Reverse((value.len(), key, facet_id_exists_docids_name)));
|
||||||
|
if heap.len() > limit {
|
||||||
|
heap.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for result in index.all_documents(rtxn)? {
|
for result in index.all_documents(rtxn)? {
|
||||||
@ -984,6 +997,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
|||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
|
facet_id_exists_docids,
|
||||||
exact_word_prefix_docids,
|
exact_word_prefix_docids,
|
||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
..
|
..
|
||||||
@ -1007,6 +1021,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
|||||||
FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(),
|
FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(),
|
||||||
FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(),
|
FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(),
|
||||||
FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(),
|
FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(),
|
||||||
|
FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(),
|
||||||
FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(),
|
FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(),
|
||||||
FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(),
|
FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(),
|
||||||
EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(),
|
EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(),
|
||||||
|
@ -25,3 +25,43 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
pub struct FieldIdCodec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FieldIdCodec {
|
||||||
|
type DItem = FieldId;
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id_bytes, _) = try_split_array_at(bytes)?;
|
||||||
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
|
Some(field_id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for FieldIdCodec {
|
||||||
|
type EItem = FieldId;
|
||||||
|
|
||||||
|
fn bytes_encode(field_id: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
Some(Cow::Owned(field_id.to_be_bytes().to_vec()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct FieldIdDocIdCodec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec {
|
||||||
|
type DItem = (FieldId, DocumentId);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||||
|
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||||
|
|
||||||
|
let document_id_bytes = bytes[..4].try_into().ok()?;
|
||||||
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
|
Some((field_id, document_id))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -15,7 +15,7 @@ use crate::error::{InternalError, UserError};
|
|||||||
use crate::fields_ids_map::FieldsIdsMap;
|
use crate::fields_ids_map::FieldsIdsMap;
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec,
|
||||||
};
|
};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||||
@ -75,6 +75,7 @@ pub mod db_name {
|
|||||||
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
||||||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||||
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
||||||
|
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
|
||||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||||
@ -116,6 +117,9 @@ pub struct Index {
|
|||||||
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
||||||
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
|
/// Maps the facet field id and the docids for which this field exists
|
||||||
|
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the facet field id, level and the number with the docids that corresponds to it.
|
/// Maps the facet field id, level and the number with the docids that corresponds to it.
|
||||||
pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the facet field id and the string with the original string and docids that corresponds to it.
|
/// Maps the facet field id and the string with the original string and docids that corresponds to it.
|
||||||
@ -134,7 +138,7 @@ impl Index {
|
|||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(16);
|
options.max_dbs(17);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
@ -152,6 +156,9 @@ impl Index {
|
|||||||
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
||||||
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
||||||
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
||||||
|
let facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec> =
|
||||||
|
env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||||
|
|
||||||
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
|
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||||
let field_id_docid_facet_strings =
|
let field_id_docid_facet_strings =
|
||||||
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
|
||||||
@ -174,6 +181,7 @@ impl Index {
|
|||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_exists_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
documents,
|
documents,
|
||||||
@ -806,6 +814,18 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Retrieve all the documents which contain this field id
|
||||||
|
pub fn exists_faceted_documents_ids(
|
||||||
|
&self,
|
||||||
|
rtxn: &RoTxn,
|
||||||
|
field_id: FieldId,
|
||||||
|
) -> heed::Result<RoaringBitmap> {
|
||||||
|
match self.facet_id_exists_docids.get(rtxn, &field_id)? {
|
||||||
|
Some(docids) => Ok(docids),
|
||||||
|
None => Ok(RoaringBitmap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* distinct field */
|
/* distinct field */
|
||||||
|
|
||||||
pub(crate) fn put_distinct_field(
|
pub(crate) fn put_distinct_field(
|
||||||
|
@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_exists_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
documents,
|
documents,
|
||||||
@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
field_id_word_count_docids.clear(self.wtxn)?;
|
field_id_word_count_docids.clear(self.wtxn)?;
|
||||||
word_prefix_position_docids.clear(self.wtxn)?;
|
word_prefix_position_docids.clear(self.wtxn)?;
|
||||||
facet_id_f64_docids.clear(self.wtxn)?;
|
facet_id_f64_docids.clear(self.wtxn)?;
|
||||||
|
facet_id_exists_docids.clear(self.wtxn)?;
|
||||||
facet_id_string_docids.clear(self.wtxn)?;
|
facet_id_string_docids.clear(self.wtxn)?;
|
||||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||||
|
@ -170,6 +170,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_position_docids,
|
word_position_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
|
facet_id_exists_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
@ -424,11 +425,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// We delete the documents ids that are under the facet field id values.
|
// We delete the documents ids that are under the facet field id values.
|
||||||
remove_docids_from_facet_field_id_number_docids(
|
remove_docids_from_facet_field_id_docids(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
&self.to_delete_docids,
|
&self.to_delete_docids,
|
||||||
)?;
|
)?;
|
||||||
|
// We delete the documents ids that are under the facet field id values.
|
||||||
|
remove_docids_from_facet_field_id_docids(
|
||||||
|
self.wtxn,
|
||||||
|
facet_id_exists_docids,
|
||||||
|
&self.to_delete_docids,
|
||||||
|
)?;
|
||||||
|
|
||||||
remove_docids_from_facet_field_id_string_docids(
|
remove_docids_from_facet_field_id_string_docids(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
@ -618,7 +625,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remove_docids_from_facet_field_id_number_docids<'a, C>(
|
fn remove_docids_from_facet_field_id_docids<'a, C>(
|
||||||
wtxn: &'a mut heed::RwTxn,
|
wtxn: &'a mut heed::RwTxn,
|
||||||
db: &heed::Database<C, CboRoaringBitmapCodec>,
|
db: &heed::Database<C, CboRoaringBitmapCodec>,
|
||||||
to_remove: &RoaringBitmap,
|
to_remove: &RoaringBitmap,
|
||||||
|
@ -0,0 +1,42 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
use heed::{BytesDecode, BytesEncode};
|
||||||
|
|
||||||
|
use super::helpers::{
|
||||||
|
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||||
|
};
|
||||||
|
use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec};
|
||||||
|
use crate::Result;
|
||||||
|
|
||||||
|
/// Extracts the documents ids where this field appears.
|
||||||
|
///
|
||||||
|
/// Returns a grenad reader whose key is the field id encoded
|
||||||
|
/// with `FieldIdCodec` and the value is a document_id (u32)
|
||||||
|
/// encoded as native-endian bytes.
|
||||||
|
#[logging_timer::time]
|
||||||
|
pub fn extract_facet_exists_docids<R: io::Read + io::Seek>(
|
||||||
|
docid_fid_facet_number: grenad::Reader<R>,
|
||||||
|
indexer: GrenadParameters,
|
||||||
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
let mut facet_exists_docids_sorter = create_sorter(
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut cursor = docid_fid_facet_number.into_cursor()?;
|
||||||
|
while let Some((key_bytes, _)) = cursor.move_on_next()? {
|
||||||
|
let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap();
|
||||||
|
|
||||||
|
let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap();
|
||||||
|
|
||||||
|
facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
sorter_into_reader(facet_exists_docids_sorter, indexer)
|
||||||
|
}
|
@ -20,7 +20,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
obkv_documents: grenad::Reader<R>,
|
obkv_documents: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
faceted_fields: &HashSet<FieldId>,
|
faceted_fields: &HashSet<FieldId>,
|
||||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||||
@ -28,7 +28,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|m| m / 2),
|
max_memory.map(|m| m / 3),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||||
@ -36,7 +36,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|m| m / 2),
|
max_memory.map(|m| m / 3),
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut fid_docid_facet_exists_sorter = create_sorter(
|
||||||
|
keep_first,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory.map(|m| m / 3),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
@ -46,15 +54,19 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
if faceted_fields.contains(&field_id) {
|
if faceted_fields.contains(&field_id) {
|
||||||
let value =
|
|
||||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
|
||||||
let (numbers, strings) = extract_facet_values(&value);
|
|
||||||
|
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
|
|
||||||
|
// here, we know already that the document must be added to the “field id exists” database
|
||||||
// prefix key with the field_id and the document_id
|
// prefix key with the field_id and the document_id
|
||||||
|
|
||||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
key_buffer.extend_from_slice(&docid_bytes);
|
key_buffer.extend_from_slice(&docid_bytes);
|
||||||
|
fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?;
|
||||||
|
|
||||||
|
let value =
|
||||||
|
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||||
|
|
||||||
|
let (numbers, strings) = extract_facet_values(&value);
|
||||||
|
|
||||||
// insert facet numbers in sorter
|
// insert facet numbers in sorter
|
||||||
for number in numbers {
|
for number in numbers {
|
||||||
@ -79,7 +91,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
|
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
|
||||||
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?,
|
||||||
|
sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
mod extract_docid_word_positions;
|
mod extract_docid_word_positions;
|
||||||
|
mod extract_facet_exists_docids;
|
||||||
mod extract_facet_number_docids;
|
mod extract_facet_number_docids;
|
||||||
mod extract_facet_string_docids;
|
mod extract_facet_string_docids;
|
||||||
mod extract_fid_docid_facet_values;
|
mod extract_fid_docid_facet_values;
|
||||||
@ -16,6 +17,7 @@ use log::debug;
|
|||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||||
|
use self::extract_facet_exists_docids::extract_facet_exists_docids;
|
||||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||||
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
||||||
@ -53,7 +55,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
})
|
})
|
||||||
.collect::<Result<()>>()?;
|
.collect::<Result<()>>()?;
|
||||||
|
|
||||||
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks
|
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
.map(|flattened_obkv_chunks| {
|
.map(|flattened_obkv_chunks| {
|
||||||
send_and_extract_flattened_documents_data(
|
send_and_extract_flattened_documents_data(
|
||||||
@ -72,7 +74,10 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
|
|
||||||
let (
|
let (
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks,
|
||||||
(docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
|
(
|
||||||
|
docid_fid_facet_numbers_chunks,
|
||||||
|
(docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks),
|
||||||
|
),
|
||||||
) = result?;
|
) = result?;
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
@ -137,6 +142,15 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
TypedChunk::FieldIdFacetNumberDocids,
|
TypedChunk::FieldIdFacetNumberDocids,
|
||||||
"field-id-facet-number-docids",
|
"field-id-facet-number-docids",
|
||||||
);
|
);
|
||||||
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
|
docid_fid_facet_exists_chunks.clone(),
|
||||||
|
indexer.clone(),
|
||||||
|
lmdb_writer_sx.clone(),
|
||||||
|
extract_facet_exists_docids,
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
TypedChunk::FieldIdFacetExistsDocids,
|
||||||
|
"field-id-facet-exists-docids",
|
||||||
|
);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -197,6 +211,7 @@ fn send_original_documents_data(
|
|||||||
/// - docid_word_positions
|
/// - docid_word_positions
|
||||||
/// - docid_fid_facet_numbers
|
/// - docid_fid_facet_numbers
|
||||||
/// - docid_fid_facet_strings
|
/// - docid_fid_facet_strings
|
||||||
|
/// - docid_fid_facet_exists
|
||||||
fn send_and_extract_flattened_documents_data(
|
fn send_and_extract_flattened_documents_data(
|
||||||
flattened_documents_chunk: Result<grenad::Reader<File>>,
|
flattened_documents_chunk: Result<grenad::Reader<File>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
@ -208,8 +223,11 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
stop_words: &Option<fst::Set<&[u8]>>,
|
stop_words: &Option<fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(
|
) -> Result<(
|
||||||
|
grenad::Reader<CursorClonableMmap>,
|
||||||
|
(
|
||||||
grenad::Reader<CursorClonableMmap>,
|
grenad::Reader<CursorClonableMmap>,
|
||||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||||
|
),
|
||||||
)> {
|
)> {
|
||||||
let flattened_documents_chunk =
|
let flattened_documents_chunk =
|
||||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||||
@ -250,8 +268,11 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
Ok(docid_word_positions_chunk)
|
Ok(docid_word_positions_chunk)
|
||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) =
|
let (
|
||||||
extract_fid_docid_facet_values(
|
docid_fid_facet_numbers_chunk,
|
||||||
|
docid_fid_facet_strings_chunk,
|
||||||
|
docid_fid_facet_exists_chunk,
|
||||||
|
) = extract_fid_docid_facet_values(
|
||||||
flattened_documents_chunk.clone(),
|
flattened_documents_chunk.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
faceted_fields,
|
faceted_fields,
|
||||||
@ -273,7 +294,13 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
docid_fid_facet_strings_chunk.clone(),
|
docid_fid_facet_strings_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk))
|
let docid_fid_facet_exists_chunk =
|
||||||
|
unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? };
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
docid_fid_facet_numbers_chunk,
|
||||||
|
(docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk),
|
||||||
|
))
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -1931,4 +1931,153 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(ids.len(), map.len());
|
assert_eq!(ids.len(), map.len());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_documents_check_exists_database_reindex() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"colour": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"colour": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"colour": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"colour": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"colour": [1]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"colour": {
|
||||||
|
"green": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]);
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
builder.add_documents(content).unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
let faceted_fields = hashset!(S("colour"));
|
||||||
|
builder.set_filterable_fields(faceted_fields);
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
assert_eq!(facets, hashset!(S("colour"), S("colour.green")));
|
||||||
|
|
||||||
|
let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
|
||||||
|
let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
|
||||||
|
|
||||||
|
let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap();
|
||||||
|
assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6]);
|
||||||
|
|
||||||
|
let bitmap_colour_green =
|
||||||
|
index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap();
|
||||||
|
assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_documents_check_exists_database() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
let faceted_fields = hashset!(S("colour"));
|
||||||
|
builder.set_filterable_fields(faceted_fields);
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let content = documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"colour": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"colour": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"colour": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"colour": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"colour": [1]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"colour": {
|
||||||
|
"green": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]);
|
||||||
|
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let mut builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
|
||||||
|
.unwrap();
|
||||||
|
builder.add_documents(content).unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||||
|
assert_eq!(facets, hashset!(S("colour"), S("colour.green")));
|
||||||
|
|
||||||
|
let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
|
||||||
|
let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
|
||||||
|
|
||||||
|
let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap();
|
||||||
|
assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6]);
|
||||||
|
|
||||||
|
let bitmap_colour_green =
|
||||||
|
index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap();
|
||||||
|
assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
|
|||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||||
|
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
||||||
GeoPoints(grenad::Reader<File>),
|
GeoPoints(grenad::Reader<File>),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,6 +147,18 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
|
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => {
|
||||||
|
append_entries_into_database(
|
||||||
|
facet_id_exists_docids_iter,
|
||||||
|
&index.facet_id_exists_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
|value, _buffer| Ok(value),
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
is_merged_database = true;
|
||||||
|
}
|
||||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
word_pair_proximity_docids_iter,
|
word_pair_proximity_docids_iter,
|
||||||
|
Loading…
Reference in New Issue
Block a user