mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Merge #308
308: Implement a better parallel indexer r=Kerollmops a=ManyTheFish Rewrite the indexer: - enhance memory consumption control - optimize parallelism using rayon and crossbeam channel - factorize the different parts and make new DB implementation easier - optimize and fix prefix databases Co-authored-by: many <maxime@meilisearch.com>
This commit is contained in:
commit
5cbe879325
@ -343,10 +343,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap());
|
||||
update_builder.log_every_n(indexer_opt_cloned.log_every_n);
|
||||
update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
|
||||
update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size);
|
||||
update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type);
|
||||
update_builder
|
||||
.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes());
|
||||
|
||||
let before_update = Instant::now();
|
||||
// we extract the update type and execute the update itself.
|
||||
|
@ -207,6 +207,24 @@ enum Command {
|
||||
word2: String,
|
||||
},
|
||||
|
||||
/// Outputs a CSV with the proximities for the two specified words and
|
||||
/// the documents ids where these relations appears.
|
||||
///
|
||||
/// `word1`, `prefix` defines the word pair specified *in this specific order*.
|
||||
/// `proximity` defines the proximity between the two specified words.
|
||||
/// `documents_ids` defines the documents ids where the relation appears.
|
||||
WordPrefixPairProximitiesDocids {
|
||||
/// Display the whole documents ids in details.
|
||||
#[structopt(long)]
|
||||
full_display: bool,
|
||||
|
||||
/// First word of the word pair.
|
||||
word1: String,
|
||||
|
||||
/// Second word of the word pair.
|
||||
prefix: String,
|
||||
},
|
||||
|
||||
/// Outputs the words FST to standard output.
|
||||
///
|
||||
/// One can use the FST binary helper to dissect and analyze it,
|
||||
@ -282,6 +300,9 @@ fn main() -> anyhow::Result<()> {
|
||||
WordPairProximitiesDocids { full_display, word1, word2 } => {
|
||||
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
|
||||
}
|
||||
WordPrefixPairProximitiesDocids { full_display, word1, prefix } => {
|
||||
word_prefix_pair_proximities_docids(&index, &rtxn, !full_display, word1, prefix)
|
||||
}
|
||||
ExportWordsFst => export_words_fst(&index, &rtxn),
|
||||
ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn),
|
||||
ExportDocuments { internal_documents_ids } => {
|
||||
@ -1131,3 +1152,46 @@ fn word_pair_proximities_docids(
|
||||
|
||||
Ok(wtr.flush()?)
|
||||
}
|
||||
|
||||
fn word_prefix_pair_proximities_docids(
|
||||
index: &Index,
|
||||
rtxn: &heed::RoTxn,
|
||||
debug: bool,
|
||||
word1: String,
|
||||
word_prefix: String,
|
||||
) -> anyhow::Result<()> {
|
||||
use heed::types::ByteSlice;
|
||||
use milli::RoaringBitmapCodec;
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||
wtr.write_record(&["word1", "word_prefix", "proximity", "documents_ids"])?;
|
||||
|
||||
// Create the prefix key with only the pair of words.
|
||||
let mut prefix = Vec::with_capacity(word1.len() + word_prefix.len() + 1);
|
||||
prefix.extend_from_slice(word1.as_bytes());
|
||||
prefix.push(0);
|
||||
prefix.extend_from_slice(word_prefix.as_bytes());
|
||||
|
||||
let db = index.word_prefix_pair_proximity_docids.as_polymorph();
|
||||
let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
|
||||
for result in iter {
|
||||
let (key, docids) = result?;
|
||||
|
||||
// Skip keys that are longer than the requested one,
|
||||
// a longer key means that the second word is a prefix of the request word.
|
||||
if key.len() != prefix.len() + 1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let proximity = key.last().unwrap();
|
||||
let docids = if debug {
|
||||
format!("{:?}", docids)
|
||||
} else {
|
||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||
};
|
||||
wtr.write_record(&[&word1, &word_prefix, &proximity.to_string(), &docids])?;
|
||||
}
|
||||
|
||||
Ok(wtr.flush()?)
|
||||
}
|
||||
|
@ -9,12 +9,13 @@ bstr = "0.2.15"
|
||||
byteorder = "1.4.2"
|
||||
chrono = { version = "0.4.19", features = ["serde"] }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.1"
|
||||
csv = "1.1.5"
|
||||
either = "1.6.1"
|
||||
flate2 = "1.0.20"
|
||||
fst = "0.4.5"
|
||||
fxhash = "0.2.1"
|
||||
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
|
||||
grenad = "0.3.0"
|
||||
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||
human_format = "1.0.3"
|
||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||
|
@ -2,51 +2,64 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::{marker, str};
|
||||
|
||||
use super::try_split_at;
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::RoaringBitmapCodec;
|
||||
use crate::{try_split_array_at, try_split_at, Result};
|
||||
|
||||
/// A codec that encodes a string in front of the value.
|
||||
pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>;
|
||||
|
||||
/// A codec that encodes a string in front of a value.
|
||||
///
|
||||
/// The usecase is for the facet string levels algorithm where we must know the
|
||||
/// original string of a normalized facet value, the original values are stored
|
||||
/// in the value to not break the lexicographical ordering of the LMDB keys.
|
||||
pub struct FacetStringLevelZeroValueCodec<C>(marker::PhantomData<C>);
|
||||
pub struct StringValueCodec<C>(marker::PhantomData<C>);
|
||||
|
||||
impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec<C>
|
||||
impl<'a, C> heed::BytesDecode<'a> for StringValueCodec<C>
|
||||
where
|
||||
C: heed::BytesDecode<'a>,
|
||||
{
|
||||
type DItem = (&'a str, C::DItem);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (string_len, bytes) = try_split_at(bytes, 2)?;
|
||||
let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?;
|
||||
|
||||
let (string, bytes) = try_split_at(bytes, string_len as usize)?;
|
||||
let string = str::from_utf8(string).ok()?;
|
||||
|
||||
let (string, bytes) = decode_prefix_string(bytes)?;
|
||||
C::bytes_decode(bytes).map(|item| (string, item))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec<C>
|
||||
impl<'a, C> heed::BytesEncode<'a> for StringValueCodec<C>
|
||||
where
|
||||
C: heed::BytesEncode<'a>,
|
||||
{
|
||||
type EItem = (&'a str, C::EItem);
|
||||
|
||||
fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let string_len: u16 = string.len().try_into().ok()?;
|
||||
let value_bytes = C::bytes_encode(&value)?;
|
||||
|
||||
let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
|
||||
bytes.extend_from_slice(&string_len.to_be_bytes());
|
||||
bytes.extend_from_slice(string.as_bytes());
|
||||
encode_prefix_string(string, &mut bytes).ok()?;
|
||||
bytes.extend_from_slice(&value_bytes[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> {
|
||||
let (original_length_bytes, bytes) = try_split_array_at(value)?;
|
||||
let original_length = u16::from_be_bytes(original_length_bytes) as usize;
|
||||
let (string, bytes) = try_split_at(bytes, original_length)?;
|
||||
let string = str::from_utf8(string).ok()?;
|
||||
Some((string, bytes))
|
||||
}
|
||||
|
||||
pub fn encode_prefix_string(string: &str, buffer: &mut Vec<u8>) -> Result<()> {
|
||||
let string_len: u16 =
|
||||
string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
buffer.extend_from_slice(&string_len.to_be_bytes());
|
||||
buffer.extend_from_slice(string.as_bytes());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use heed::types::Unit;
|
||||
@ -54,17 +67,15 @@ mod tests {
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::*;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
#[test]
|
||||
fn deserialize_roaring_bitmaps() {
|
||||
let string = "abc";
|
||||
let docids: RoaringBitmap = (0..100).chain(3500..4398).collect();
|
||||
let key = (string, docids.clone());
|
||||
let bytes =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&key).unwrap();
|
||||
let bytes = StringValueCodec::<RoaringBitmapCodec>::bytes_encode(&key).unwrap();
|
||||
let (out_string, out_docids) =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&bytes).unwrap();
|
||||
StringValueCodec::<RoaringBitmapCodec>::bytes_decode(&bytes).unwrap();
|
||||
assert_eq!((out_string, out_docids), (string, docids));
|
||||
}
|
||||
|
||||
@ -72,9 +83,8 @@ mod tests {
|
||||
fn deserialize_unit() {
|
||||
let string = "def";
|
||||
let key = (string, ());
|
||||
let bytes = FacetStringLevelZeroValueCodec::<Unit>::bytes_encode(&key).unwrap();
|
||||
let (out_string, out_unit) =
|
||||
FacetStringLevelZeroValueCodec::<Unit>::bytes_decode(&bytes).unwrap();
|
||||
let bytes = StringValueCodec::<Unit>::bytes_encode(&key).unwrap();
|
||||
let (out_string, out_unit) = StringValueCodec::<Unit>::bytes_decode(&bytes).unwrap();
|
||||
assert_eq!((out_string, out_unit), (string, ()));
|
||||
}
|
||||
}
|
||||
|
@ -9,7 +9,9 @@ mod field_doc_id_facet_string_codec;
|
||||
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
||||
pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
|
||||
pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
|
||||
pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec;
|
||||
pub use self::facet_string_level_zero_value_codec::{
|
||||
decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec,
|
||||
};
|
||||
pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
|
||||
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
|
||||
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
|
||||
|
@ -52,6 +52,46 @@ impl CboRoaringBitmapCodec {
|
||||
RoaringBitmap::deserialize_from(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge serialized CboRoaringBitmaps in a buffer.
|
||||
///
|
||||
/// if the merged values length is under the threshold, values are directly
|
||||
/// serialized in the buffer else a RoaringBitmap is created from the
|
||||
/// values and is serialized in the buffer.
|
||||
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
let mut roaring = RoaringBitmap::new();
|
||||
let mut vec = Vec::new();
|
||||
|
||||
for bytes in slices {
|
||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
||||
let mut reader = bytes.as_ref();
|
||||
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
||||
vec.push(integer);
|
||||
}
|
||||
} else {
|
||||
roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?;
|
||||
}
|
||||
}
|
||||
|
||||
if roaring.is_empty() {
|
||||
vec.sort_unstable();
|
||||
vec.dedup();
|
||||
|
||||
if vec.len() <= THRESHOLD {
|
||||
for integer in vec {
|
||||
buffer.extend_from_slice(&integer.to_ne_bytes());
|
||||
}
|
||||
} else {
|
||||
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter());
|
||||
roaring.serialize_into(buffer)?;
|
||||
}
|
||||
} else {
|
||||
roaring.extend(vec);
|
||||
roaring.serialize_into(buffer)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||
@ -106,4 +146,40 @@ mod tests {
|
||||
|
||||
assert!(roaring_size > bo_size);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn merge_cbo_roaring_bitmaps() {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
let small_data = vec![
|
||||
RoaringBitmap::from_sorted_iter(1..4),
|
||||
RoaringBitmap::from_sorted_iter(2..5),
|
||||
RoaringBitmap::from_sorted_iter(4..6),
|
||||
RoaringBitmap::from_sorted_iter(1..3),
|
||||
];
|
||||
|
||||
let small_data: Vec<_> =
|
||||
small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
|
||||
CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap();
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
|
||||
let expected = RoaringBitmap::from_sorted_iter(1..6);
|
||||
assert_eq!(bitmap, expected);
|
||||
|
||||
let medium_data = vec![
|
||||
RoaringBitmap::from_sorted_iter(1..4),
|
||||
RoaringBitmap::from_sorted_iter(2..5),
|
||||
RoaringBitmap::from_sorted_iter(4..8),
|
||||
RoaringBitmap::from_sorted_iter(0..3),
|
||||
RoaringBitmap::from_sorted_iter(7..23),
|
||||
];
|
||||
|
||||
let medium_data: Vec<_> =
|
||||
medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();
|
||||
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
|
||||
let expected = RoaringBitmap::from_sorted_iter(0..23);
|
||||
assert_eq!(bitmap, expected);
|
||||
}
|
||||
}
|
||||
|
@ -93,8 +93,7 @@ pub struct Index {
|
||||
/// Maps the facet field id, level and the number with the docids that corresponds to it.
|
||||
pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the string with the original string and docids that corresponds to it.
|
||||
pub facet_id_string_docids:
|
||||
Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
|
||||
pub facet_id_string_docids: Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
||||
|
||||
/// Maps the document id, the facet field id and the numbers.
|
||||
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
|
||||
|
@ -13,11 +13,9 @@ mod search;
|
||||
pub mod tree_level;
|
||||
pub mod update;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::hash::BuildHasherDefault;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use fxhash::{FxHasher32, FxHasher64};
|
||||
pub use grenad::CompressionType;
|
||||
@ -54,8 +52,6 @@ pub type FieldId = u16;
|
||||
pub type Position = u32;
|
||||
pub type FieldDistribution = BTreeMap<String, u64>;
|
||||
|
||||
type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
|
||||
|
||||
/// Transform a raw obkv store into a JSON Object.
|
||||
pub fn obkv_to_json(
|
||||
displayed_fields: &[FieldId],
|
||||
|
@ -2,8 +2,8 @@ use std::cmp;
|
||||
|
||||
use crate::{Attribute, Position};
|
||||
|
||||
const ONE_ATTRIBUTE: u32 = 1000;
|
||||
const MAX_DISTANCE: u32 = 8;
|
||||
pub const ONE_ATTRIBUTE: u32 = 1000;
|
||||
pub const MAX_DISTANCE: u32 = 8;
|
||||
|
||||
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||
if lhs <= rhs {
|
||||
|
@ -461,13 +461,18 @@ fn query_pair_proximity_docids(
|
||||
let prefix = right.prefix;
|
||||
match (&left.kind, &right.kind) {
|
||||
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
||||
if prefix && ctx.in_prefix_cache(&right) {
|
||||
Ok(ctx
|
||||
.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
|
||||
.unwrap_or_default())
|
||||
} else if prefix {
|
||||
if prefix {
|
||||
match ctx.word_prefix_pair_proximity_docids(
|
||||
left.as_str(),
|
||||
right.as_str(),
|
||||
proximity,
|
||||
)? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => {
|
||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Ok(ctx
|
||||
.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
|
||||
@ -477,22 +482,24 @@ fn query_pair_proximity_docids(
|
||||
(QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
|
||||
let l_words =
|
||||
word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||
if prefix && ctx.in_prefix_cache(&right) {
|
||||
if prefix {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for (left, _) in l_words {
|
||||
let current_docids = ctx
|
||||
.word_prefix_pair_proximity_docids(
|
||||
left.as_ref(),
|
||||
right.as_ref(),
|
||||
let current_docids = match ctx.word_prefix_pair_proximity_docids(
|
||||
left.as_str(),
|
||||
right.as_str(),
|
||||
proximity,
|
||||
)?
|
||||
.unwrap_or_default();
|
||||
)? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => {
|
||||
let r_words =
|
||||
word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
|
||||
}
|
||||
}?;
|
||||
docids |= current_docids;
|
||||
}
|
||||
Ok(docids)
|
||||
} else if prefix {
|
||||
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
|
||||
} else {
|
||||
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||
}
|
||||
|
@ -269,11 +269,7 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> {
|
||||
///
|
||||
/// It yields the facet string and the roaring bitmap associated with it.
|
||||
pub struct FacetStringLevelZeroRange<'t> {
|
||||
iter: RoRange<
|
||||
't,
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringLevelZeroRange<'t> {
|
||||
@ -316,10 +312,7 @@ impl<'t> FacetStringLevelZeroRange<'t> {
|
||||
let iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.range(rtxn, &(left_bound, right_bound))?
|
||||
.remap_types::<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>
|
||||
>();
|
||||
.remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>();
|
||||
|
||||
Ok(FacetStringLevelZeroRange { iter })
|
||||
}
|
||||
@ -340,11 +333,7 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
|
||||
}
|
||||
|
||||
pub struct FacetStringLevelZeroRevRange<'t> {
|
||||
iter: RoRevRange<
|
||||
't,
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringLevelZeroRevRange<'t> {
|
||||
@ -387,10 +376,7 @@ impl<'t> FacetStringLevelZeroRevRange<'t> {
|
||||
let iter = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.rev_range(rtxn, &(left_bound, right_bound))?
|
||||
.remap_types::<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>
|
||||
>();
|
||||
.remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>();
|
||||
|
||||
Ok(FacetStringLevelZeroRevRange { iter })
|
||||
}
|
||||
|
@ -392,10 +392,7 @@ impl FilterCondition {
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
strings_db: heed::Database<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
strings_db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
||||
field_id: FieldId,
|
||||
operator: &Operator,
|
||||
) -> Result<RoaringBitmap> {
|
||||
|
@ -490,7 +490,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>(
|
||||
None => {
|
||||
// The key corresponds to a level zero facet string.
|
||||
let (original_value, mut docids) =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(val)
|
||||
FacetStringLevelZeroValueCodec::bytes_decode(val)
|
||||
.ok_or_else(|| SerializationError::Decoding { db_name })?;
|
||||
|
||||
let previous_len = docids.len();
|
||||
@ -501,8 +501,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>(
|
||||
} else if docids.len() != previous_len {
|
||||
let key = key.to_owned();
|
||||
let val = &(original_value, docids);
|
||||
let value_bytes =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(val)
|
||||
let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val)
|
||||
.ok_or_else(|| SerializationError::Encoding { db_name })?;
|
||||
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
|
@ -3,7 +3,7 @@ use std::num::{NonZeroU8, NonZeroUsize};
|
||||
use std::{cmp, mem};
|
||||
|
||||
use chrono::Utc;
|
||||
use grenad::{CompressionType, FileFuse, Reader, Writer};
|
||||
use grenad::{CompressionType, Reader, Writer};
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{BytesEncode, Error};
|
||||
use log::debug;
|
||||
@ -25,7 +25,6 @@ pub struct Facets<'t, 'u, 'i> {
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroUsize,
|
||||
min_level_size: NonZeroUsize,
|
||||
_update_id: u64,
|
||||
@ -42,7 +41,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
level_group_size: NonZeroUsize::new(4).unwrap(),
|
||||
min_level_size: NonZeroUsize::new(5).unwrap(),
|
||||
_update_id: update_id,
|
||||
@ -59,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
self
|
||||
}
|
||||
|
||||
#[logging_timer::time("Facets::{}")]
|
||||
pub fn execute(self) -> Result<()> {
|
||||
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
||||
// We get the faceted fields to be able to create the facet levels.
|
||||
@ -86,7 +85,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
self.index.facet_id_string_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
field_id,
|
||||
@ -107,7 +105,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
self.index.facet_id_f64_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
field_id,
|
||||
@ -128,7 +125,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.index.facet_id_f64_docids.as_polymorph(),
|
||||
facet_number_levels,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }),
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?,
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
|
||||
@ -136,7 +133,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.index.facet_id_string_docids.as_polymorph(),
|
||||
facet_string_levels,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }),
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?,
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
}
|
||||
@ -161,11 +158,10 @@ fn compute_facet_number_levels<'t>(
|
||||
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
|
||||
compression_type: CompressionType,
|
||||
compression_level: Option<u32>,
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroUsize,
|
||||
min_level_size: NonZeroUsize,
|
||||
field_id: FieldId,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
) -> Result<Reader<File>> {
|
||||
let first_level_size = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(rtxn, &field_id.to_be_bytes())?
|
||||
@ -219,7 +215,7 @@ fn compute_facet_number_levels<'t>(
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer, shrink_size)
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
fn write_number_entry(
|
||||
@ -239,7 +235,7 @@ fn write_number_entry(
|
||||
|
||||
fn compute_faceted_strings_documents_ids(
|
||||
rtxn: &heed::RoTxn,
|
||||
db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
|
||||
db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec>,
|
||||
field_id: FieldId,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
@ -278,17 +274,13 @@ fn clear_field_string_levels<'t>(
|
||||
|
||||
fn compute_facet_string_levels<'t>(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: heed::Database<
|
||||
FacetStringLevelZeroCodec,
|
||||
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
||||
compression_type: CompressionType,
|
||||
compression_level: Option<u32>,
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroUsize,
|
||||
min_level_size: NonZeroUsize,
|
||||
field_id: FieldId,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
) -> Result<Reader<File>> {
|
||||
let first_level_size = db
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(rtxn, &field_id.to_be_bytes())?
|
||||
@ -340,7 +332,7 @@ fn compute_facet_string_levels<'t>(
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer, shrink_size)
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
fn write_string_entry(
|
||||
|
@ -0,0 +1,167 @@
|
||||
use std::collections::HashSet;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use meilisearch_tokenizer::token::SeparatorKind;
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::proximity::ONE_ATTRIBUTE;
|
||||
use crate::{FieldId, Result};
|
||||
|
||||
/// Extracts the word and positions where this word appear and
|
||||
/// prefixes it by the document id.
|
||||
///
|
||||
/// Returns the generated internal documents ids and a grenad reader
|
||||
/// with the list of extracted words from the given chunk of documents.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_docid_word_positions<R: io::Read>(
|
||||
mut obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
concat_u32s_array,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut field_buffer = String::new();
|
||||
let mut config = AnalyzerConfig::default();
|
||||
if let Some(stop_words) = stop_words {
|
||||
config.stop_words(stop_words);
|
||||
}
|
||||
let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
|
||||
|
||||
while let Some((key, value)) = obkv_documents.next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = obkv::KvReader::<FieldId>::new(value);
|
||||
|
||||
documents_ids.push(document_id);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
||||
let analyzed = analyzer.analyze(field);
|
||||
let tokens = process_tokens(analyzed.tokens())
|
||||
.take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);
|
||||
|
||||
for (index, token) in tokens {
|
||||
let token = token.text().trim();
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(token.as_bytes());
|
||||
|
||||
let position: u32 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let position = field_id as u32 * ONE_ATTRIBUTE + position;
|
||||
docid_word_positions_sorter.insert(&key_buffer, &position.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
Value::Object(object) => {
|
||||
let mut buffer = String::new();
|
||||
let mut count = 0;
|
||||
for (key, value) in object {
|
||||
buffer.clear();
|
||||
let _ = write!(&mut buffer, "{}: ", key);
|
||||
if inner(value, &mut buffer) {
|
||||
buffer.push_str(". ");
|
||||
// We write the "key: value. " pair only when
|
||||
// we are sure that the value can be written.
|
||||
output.push_str(&buffer);
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Value::String(string) = value {
|
||||
Some(&string)
|
||||
} else if inner(value, buffer) {
|
||||
Some(buffer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standart proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator().is_some())
|
||||
.scan((0, None), |(offset, prev_kind), token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec};
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet numbers and
|
||||
/// documents ids from the given chunk of docid facet number positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_facet_number_docids<R: io::Read>(
|
||||
mut docid_fid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
while let Some((key_bytes, _)) = docid_fid_facet_number.next()? {
|
||||
let (field_id, document_id, number) =
|
||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||
|
||||
let key = (field_id, 0, number, number);
|
||||
let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap();
|
||||
|
||||
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
use std::fs::File;
|
||||
use std::iter::FromIterator;
|
||||
use std::{io, str};
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters,
|
||||
};
|
||||
use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec};
|
||||
use crate::{FieldId, Result};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_facet_string_docids<R: io::Read>(
|
||||
mut docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
keep_first_prefix_value_merge_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
while let Some((key, original_value_bytes)) = docid_fid_facet_string.next()? {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let original_value = str::from_utf8(original_value_bytes)?;
|
||||
|
||||
key_buffer.clear();
|
||||
FacetStringLevelZeroCodec::serialize_into(
|
||||
field_id,
|
||||
str::from_utf8(normalized_value_bytes)?,
|
||||
&mut key_buffer,
|
||||
);
|
||||
|
||||
value_buffer.clear();
|
||||
encode_prefix_string(original_value, &mut value_buffer)?;
|
||||
let bitmap = RoaringBitmap::from_iter(Some(document_id));
|
||||
bitmap.serialize_into(&mut value_buffer)?;
|
||||
|
||||
facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||
}
|
@ -0,0 +1,120 @@
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::mem::size_of;
|
||||
|
||||
use heed::zerocopy::AsBytes;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read>(
|
||||
mut obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
keep_first,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
keep_first,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
while let Some((docid_bytes, value)) = obkv_documents.next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if faceted_fields.contains(&field_id) {
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
let (numbers, strings) = extract_facet_values(&value);
|
||||
|
||||
key_buffer.clear();
|
||||
|
||||
// prefix key with the field_id and the document_id
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
key_buffer.extend_from_slice(&docid_bytes);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for number in numbers {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
|
||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||
key_buffer.extend_from_slice(normalized.as_bytes());
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
|
||||
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
))
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(original) => {
|
||||
let normalized = original.trim().to_lowercase();
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(value, false, output_numbers, output_strings);
|
||||
}
|
||||
}
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
||||
let mut facet_number_values = Vec::new();
|
||||
let mut facet_string_values = Vec::new();
|
||||
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
|
||||
|
||||
(facet_number_values, facet_string_values)
|
||||
}
|
@ -0,0 +1,95 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::{cmp, io};
|
||||
|
||||
use grenad::Sorter;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters, MergeFn,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::extract_position;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the field id word count and the documents ids where
|
||||
/// this field id with this amount of words appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted field id word counts
|
||||
/// and documents ids from the given chunk of docid word positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_fid_word_count_docids<R: io::Read>(
|
||||
mut docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
// This map is assumed to not consume a lot of memory.
|
||||
let mut document_fid_wordcount = HashMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
while let Some((key, value)) = docid_word_positions.next()? {
|
||||
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
|
||||
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
||||
if curr_document_id != document_id {
|
||||
drain_document_fid_wordcount_into_sorter(
|
||||
&mut fid_word_count_docids_sorter,
|
||||
&mut document_fid_wordcount,
|
||||
curr_document_id,
|
||||
)?;
|
||||
current_document_id = Some(document_id);
|
||||
}
|
||||
|
||||
for position in read_u32_ne_bytes(value) {
|
||||
let (field_id, position) = extract_position(position);
|
||||
let word_count = position + 1;
|
||||
|
||||
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
||||
*value = cmp::max(*value, word_count);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// We must make sure that don't lose the current document field id
|
||||
// word count map if we break because we reached the end of the chunk.
|
||||
drain_document_fid_wordcount_into_sorter(
|
||||
&mut fid_word_count_docids_sorter,
|
||||
&mut document_fid_wordcount,
|
||||
document_id,
|
||||
)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||
}
|
||||
|
||||
fn drain_document_fid_wordcount_into_sorter(
|
||||
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
|
||||
document_fid_wordcount: &mut HashMap<FieldId, u32>,
|
||||
document_id: DocumentId,
|
||||
) -> Result<()> {
|
||||
let mut key_buffer = Vec::new();
|
||||
|
||||
for (fid, count) in document_fid_wordcount.drain() {
|
||||
if count <= 10 {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
key_buffer.push(count as u8);
|
||||
|
||||
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
@ -0,0 +1,46 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_word_docids<R: io::Read>(
|
||||
mut docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
merge_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut value_buffer = Vec::new();
|
||||
while let Some((key, _value)) = docid_word_positions.next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let bitmap = RoaringBitmap::from_iter(Some(document_id));
|
||||
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
|
||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(word_docids_sorter, indexer)
|
||||
}
|
@ -0,0 +1,51 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words at positions and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_word_level_position_docids<R: io::Read>(
|
||||
mut docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_level_position_docids_sorter = create_sorter(
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
while let Some((key, value)) = docid_word_positions.next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
for position in read_u32_ne_bytes(value) {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0); // tree level
|
||||
|
||||
// Levels are composed of left and right bounds.
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
|
||||
word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(word_level_position_docids_sorter, indexer)
|
||||
}
|
@ -0,0 +1,174 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::fs::File;
|
||||
use std::{cmp, io, mem, str, vec};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||
try_split_array_at, GrenadParameters, MergeFn,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{positions_proximity, MAX_DISTANCE};
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted word pairs proximities and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[logging_timer::time]
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read>(
|
||||
mut docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_pair_proximity_docids_sorter = create_sorter(
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
// This map is assumed to not consume a lot of memory.
|
||||
let mut document_word_positions_heap = BinaryHeap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
while let Some((key, value)) = docid_word_positions.next()? {
|
||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let word = str::from_utf8(word_bytes)?;
|
||||
|
||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
||||
if curr_document_id != document_id {
|
||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
||||
document_word_positions_into_sorter(
|
||||
curr_document_id,
|
||||
document_word_positions_heap,
|
||||
&mut word_pair_proximity_docids_sorter,
|
||||
)?;
|
||||
current_document_id = Some(document_id);
|
||||
}
|
||||
|
||||
let word = word.to_string();
|
||||
let mut iter = read_u32_ne_bytes(value).collect::<Vec<_>>().into_iter();
|
||||
if let Some(position) = iter.next() {
|
||||
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// We must make sure that don't lose the current document field id
|
||||
// word count map if we break because we reached the end of the chunk.
|
||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
document_word_positions_heap,
|
||||
&mut word_pair_proximity_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
|
||||
}
|
||||
|
||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
///
|
||||
/// This list is used by the engine to calculate the documents containing words that are
|
||||
/// close to each other.
|
||||
fn document_word_positions_into_sorter<'b>(
|
||||
document_id: DocumentId,
|
||||
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
|
||||
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
) -> Result<()> {
|
||||
let mut word_pair_proximity = HashMap::new();
|
||||
let mut ordered_peeked_word_positions = Vec::new();
|
||||
while !word_positions_heap.is_empty() {
|
||||
while let Some(peeked_word_position) = word_positions_heap.pop() {
|
||||
ordered_peeked_word_positions.push(peeked_word_position);
|
||||
if ordered_peeked_word_positions.len() == 7 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
|
||||
for PeekedWordPosition { word, position, .. } in tail {
|
||||
let prox = positions_proximity(head.position, *position);
|
||||
if prox > 0 && prox < MAX_DISTANCE {
|
||||
word_pair_proximity
|
||||
.entry((head.word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
|
||||
// We also compute the inverse proximity.
|
||||
let prox = prox + 1;
|
||||
if prox < MAX_DISTANCE {
|
||||
word_pair_proximity
|
||||
.entry((word.clone(), head.word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Push the tail in the heap.
|
||||
let tail_iter = ordered_peeked_word_positions.drain(1..);
|
||||
word_positions_heap.extend(tail_iter);
|
||||
|
||||
// Advance the head and push it in the heap.
|
||||
if let Some(mut head) = ordered_peeked_word_positions.pop() {
|
||||
if let Some(next_position) = head.iter.next() {
|
||||
word_positions_heap.push(PeekedWordPosition {
|
||||
word: head.word,
|
||||
position: next_position,
|
||||
iter: head.iter,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
for ((w1, w2), prox) in word_pair_proximity {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
key_buffer.push(prox as u8);
|
||||
|
||||
word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct PeekedWordPosition<I> {
|
||||
word: String,
|
||||
position: u32,
|
||||
iter: I,
|
||||
}
|
||||
|
||||
impl<I> Ord for PeekedWordPosition<I> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.position.cmp(&other.position).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<I> PartialOrd for PeekedWordPosition<I> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<I> Eq for PeekedWordPosition<I> {}
|
||||
|
||||
impl<I> PartialEq for PeekedWordPosition<I> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.position == other.position
|
||||
}
|
||||
}
|
230
milli/src/update/index_documents/extract/mod.rs
Normal file
230
milli/src/update/index_documents/extract/mod.rs
Normal file
@ -0,0 +1,230 @@
|
||||
mod extract_docid_word_positions;
|
||||
mod extract_facet_number_docids;
|
||||
mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_level_position_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use log::debug;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_level_position_docids::extract_word_level_position_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use super::helpers::{
|
||||
into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
|
||||
merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
|
||||
};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::{FieldId, Result};
|
||||
|
||||
/// Extract data for each databases from obkv documents in parallel.
|
||||
/// Send data in grenad file over provided Sender.
|
||||
pub(crate) fn data_from_obkv_documents(
|
||||
obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: Option<HashSet<FieldId>>,
|
||||
faceted_fields: HashSet<FieldId>,
|
||||
stop_words: Option<fst::Set<&[u8]>>,
|
||||
) -> Result<()> {
|
||||
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|result| {
|
||||
extract_documents_data(
|
||||
result,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
&stop_words,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (
|
||||
docid_word_positions_chunks,
|
||||
(docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
|
||||
) = result?;
|
||||
|
||||
spawn_extraction_task(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordcountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_docids,
|
||||
merge_roaring_bitmaps,
|
||||
TypedChunk::WordDocids,
|
||||
"word-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_level_position_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordLevelPositionDocids,
|
||||
"word-level-position-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task(
|
||||
docid_fid_facet_strings_chunks.clone(),
|
||||
indexer.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
keep_first_prefix_value_merge_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task(
|
||||
docid_fid_facet_numbers_chunks.clone(),
|
||||
indexer.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn spawn_extraction_task<FE, FS>(
|
||||
chunks: Vec<grenad::Reader<CursorClonableMmap>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
merge_fn: MergeFn,
|
||||
serialize_fn: FS,
|
||||
name: &'static str,
|
||||
) where
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<grenad::Reader<File>>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(grenad::Reader<File>) -> TypedChunk + Sync + Send + 'static,
|
||||
{
|
||||
rayon::spawn(move || {
|
||||
let chunks: Result<Vec<_>> =
|
||||
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect();
|
||||
rayon::spawn(move || match chunks {
|
||||
Ok(chunks) => {
|
||||
debug!("merge {} database", name);
|
||||
let reader = merge_readers(chunks, merge_fn, indexer);
|
||||
let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
/// Extract chuncked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents
|
||||
/// - documents_ids
|
||||
/// - docid_word_positions
|
||||
/// - docid_fid_facet_numbers
|
||||
/// - docid_fid_facet_strings
|
||||
fn extract_documents_data(
|
||||
documents_chunk: Result<grenad::Reader<File>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
stop_words: &Option<fst::Set<&[u8]>>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?;
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone())));
|
||||
|
||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
|
||||
documents_chunk.clone(),
|
||||
indexer.clone(),
|
||||
searchable_fields,
|
||||
stop_words.as_ref(),
|
||||
)?;
|
||||
|
||||
// send documents_ids to DB writer
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { into_clonable_grenad(docid_word_positions_chunk)? };
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
|
||||
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) =
|
||||
extract_fid_docid_facet_values(
|
||||
documents_chunk.clone(),
|
||||
indexer.clone(),
|
||||
faceted_fields,
|
||||
)?;
|
||||
|
||||
// send docid_fid_facet_numbers_chunk to DB writer
|
||||
let docid_fid_facet_numbers_chunk =
|
||||
unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||
docid_fid_facet_numbers_chunk.clone(),
|
||||
)));
|
||||
|
||||
// send docid_fid_facet_strings_chunk to DB writer
|
||||
let docid_fid_facet_strings_chunk =
|
||||
unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||
docid_fid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
|
||||
}
|
24
milli/src/update/index_documents/helpers/clonable_mmap.rs
Normal file
24
milli/src/update/index_documents/helpers/clonable_mmap.rs
Normal file
@ -0,0 +1,24 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use memmap::Mmap;
|
||||
|
||||
/// Wrapper around Mmap allowing to virtualy clone grenad-chunks
|
||||
/// in a parallel process like the indexing.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClonableMmap {
|
||||
inner: Arc<Mmap>,
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ClonableMmap {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.inner.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Mmap> for ClonableMmap {
|
||||
fn from(inner: Mmap) -> ClonableMmap {
|
||||
ClonableMmap { inner: Arc::new(inner) }
|
||||
}
|
||||
}
|
||||
|
||||
pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>;
|
262
milli/src/update/index_documents/helpers/grenad_helpers.rs
Normal file
262
milli/src/update/index_documents/helpers/grenad_helpers.rs
Normal file
@ -0,0 +1,262 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::time::Instant;
|
||||
|
||||
use grenad::{CompressionType, MergerIter, Reader, Sorter};
|
||||
use heed::types::ByteSlice;
|
||||
use log::debug;
|
||||
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use crate::error::InternalError;
|
||||
use crate::update::index_documents::WriteMethod;
|
||||
use crate::Result;
|
||||
|
||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
|
||||
pub fn create_writer<R: io::Write>(
|
||||
typ: grenad::CompressionType,
|
||||
level: Option<u32>,
|
||||
file: R,
|
||||
) -> io::Result<grenad::Writer<R>> {
|
||||
let mut builder = grenad::Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
builder.compression_level(level);
|
||||
}
|
||||
builder.build(file)
|
||||
}
|
||||
|
||||
pub fn create_sorter(
|
||||
merge: MergeFn,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
) -> grenad::Sorter<MergeFn> {
|
||||
let mut builder = grenad::Sorter::builder(merge);
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
builder.chunk_compression_level(level);
|
||||
}
|
||||
if let Some(nb_chunks) = max_nb_chunks {
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.dump_threshold(memory);
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn sorter_into_reader(
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)
|
||||
})?;
|
||||
sorter.write_into(&mut writer)?;
|
||||
Ok(writer_into_reader(writer)?)
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
|
||||
let mut file = writer.into_inner()?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
grenad::Reader::new(file).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub unsafe fn into_clonable_grenad(
|
||||
reader: grenad::Reader<File>,
|
||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||
let file = reader.into_inner();
|
||||
let mmap = memmap::Mmap::map(&file)?;
|
||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||
let reader = grenad::Reader::new(cursor)?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
pub fn merge_readers<R: io::Read>(
|
||||
readers: Vec<grenad::Reader<R>>,
|
||||
merge_fn: MergeFn,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let mut merger_builder = grenad::MergerBuilder::new(merge_fn);
|
||||
merger_builder.extend(readers);
|
||||
let merger = merger_builder.build();
|
||||
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)
|
||||
})?;
|
||||
merger.write_into(&mut writer)?;
|
||||
let reader = writer_into_reader(writer)?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for GrenadParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_memory: None,
|
||||
max_nb_chunks: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GrenadParameters {
|
||||
/// This function use the number of threads in the current threadpool to compute the value.
|
||||
/// This should be called inside of a rayon thread pool,
|
||||
/// Otherwise, it will take the global number of threads.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads())
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator that outputs grenad readers of obkv documents
|
||||
/// with a maximum size of approximately `documents_chunks_size`.
|
||||
///
|
||||
/// The grenad obkv entries are composed of an incremental document id big-endian
|
||||
/// encoded as the key and an obkv object with an `u8` for the field as the key
|
||||
/// and a simple UTF-8 encoded string as the value.
|
||||
pub fn grenad_obkv_into_chunks<R: io::Read>(
|
||||
mut reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
documents_chunk_size: usize,
|
||||
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
|
||||
let mut continue_reading = true;
|
||||
|
||||
let indexer_clone = indexer.clone();
|
||||
let mut transposer = move || {
|
||||
if !continue_reading {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut current_chunk_size = 0u64;
|
||||
let mut obkv_documents = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(
|
||||
indexer_clone.chunk_compression_type,
|
||||
indexer_clone.chunk_compression_level,
|
||||
file,
|
||||
)
|
||||
})?;
|
||||
|
||||
while let Some((document_id, obkv)) = reader.next()? {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
}
|
||||
}
|
||||
|
||||
continue_reading = false;
|
||||
writer_into_reader(obkv_documents).map(Some)
|
||||
};
|
||||
|
||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||
}
|
||||
|
||||
pub fn write_into_lmdb_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
mut reader: Reader<File>,
|
||||
merge: MergeFn,
|
||||
method: WriteMethod,
|
||||
) -> Result<()> {
|
||||
debug!("Writing MTBL stores...");
|
||||
let before = Instant::now();
|
||||
|
||||
match method {
|
||||
WriteMethod::Append => {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if key == k => {
|
||||
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
|
||||
let val = merge(k, &vals)?;
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(k, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn sorter_into_lmdb_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
sorter: Sorter<MergeFn>,
|
||||
merge: MergeFn,
|
||||
method: WriteMethod,
|
||||
) -> Result<()> {
|
||||
debug!("Writing MTBL sorter...");
|
||||
let before = Instant::now();
|
||||
|
||||
merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn merger_iter_into_lmdb_database<R: io::Read>(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
mut sorter: MergerIter<R, MergeFn>,
|
||||
merge: MergeFn,
|
||||
method: WriteMethod,
|
||||
) -> Result<()> {
|
||||
match method {
|
||||
WriteMethod::Append => {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if key == k => {
|
||||
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
|
||||
let val = merge(k, &vals).map_err(|_| {
|
||||
// TODO just wrap this error?
|
||||
InternalError::IndexingMergingKeys { process: "get-put-merge" }
|
||||
})?;
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(k, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
130
milli/src/update/index_documents/helpers/merge_functions.rs
Normal file
130
milli/src/update/index_documents/helpers/merge_functions.rs
Normal file
@ -0,0 +1,130 @@
|
||||
use std::borrow::Cow;
|
||||
use std::io;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::read_u32_ne_bytes;
|
||||
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::Result;
|
||||
|
||||
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
||||
|
||||
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let capacity = values.iter().map(|v| v.len()).sum::<usize>();
|
||||
let mut output = Vec::with_capacity(capacity);
|
||||
values.iter().for_each(|integers| output.extend_from_slice(integers));
|
||||
Ok(Cow::Owned(output))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
|
||||
read_u32_ne_bytes(slice).collect()
|
||||
}
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
buffer.reserve(bitmap.serialized_size());
|
||||
bitmap.serialize_into(buffer)
|
||||
}
|
||||
|
||||
pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let original = decode_prefix_string(&values[0]).unwrap().0;
|
||||
let merged_bitmaps = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(decode_prefix_string)
|
||||
.map(Option::unwrap)
|
||||
.map(|(_, bitmap_bytes)| bitmap_bytes)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
|
||||
let cap = std::mem::size_of::<u16>() + original.len() + merged_bitmaps.serialized_size();
|
||||
let mut buffer = Vec::with_capacity(cap);
|
||||
encode_prefix_string(original, &mut buffer)?;
|
||||
merged_bitmaps.serialize_into(&mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
|
||||
/// Merge all the obks in the order we see them.
|
||||
pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs
|
||||
.into_iter()
|
||||
.cloned()
|
||||
.reduce(|acc, current| {
|
||||
let first = obkv::KvReader::new(&acc);
|
||||
let second = obkv::KvReader::new(¤t);
|
||||
let mut buffer = Vec::new();
|
||||
merge_two_obkvs(first, second, &mut buffer);
|
||||
Cow::from(buffer)
|
||||
})
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
45
milli/src/update/index_documents/helpers/mod.rs
Normal file
45
milli/src/update/index_documents/helpers/mod.rs
Normal file
@ -0,0 +1,45 @@
|
||||
mod clonable_mmap;
|
||||
mod grenad_helpers;
|
||||
mod merge_functions;
|
||||
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
|
||||
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||
pub use grenad_helpers::{
|
||||
create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers,
|
||||
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
|
||||
GrenadParameters,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,
|
||||
merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs,
|
||||
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
|
||||
};
|
||||
|
||||
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
||||
key.as_ref().len() <= 511
|
||||
}
|
||||
|
||||
/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
|
||||
pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
|
||||
if mid <= slice.len() {
|
||||
Some(slice.split_at(mid))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Divides one slice into an array and the tail at an index,
|
||||
/// returns `None` if `N` is out of bounds.
|
||||
pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
|
||||
where
|
||||
[T; N]: for<'a> TryFrom<&'a [T]>,
|
||||
{
|
||||
let (head, tail) = try_split_at(slice, N)?;
|
||||
let head = head.try_into().ok()?;
|
||||
Some((head, tail))
|
||||
}
|
||||
|
||||
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
|
||||
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
|
||||
}
|
@ -1,106 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use fst::IntoStreamer;
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::Result;
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
Ok(obkvs.last().unwrap().clone().into_owned())
|
||||
}
|
||||
|
||||
/// Merge all the obks in the order we see them.
|
||||
pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let mut iter = obkvs.iter();
|
||||
let first = iter.next().map(|b| b.clone().into_owned()).unwrap();
|
||||
Ok(iter.fold(first, |acc, current| {
|
||||
let first = obkv::KvReader::new(&acc);
|
||||
let second = obkv::KvReader::new(current);
|
||||
let mut buffer = Vec::new();
|
||||
merge_two_obkvs(first, second, &mut buffer);
|
||||
buffer
|
||||
}))
|
||||
}
|
||||
|
||||
// Union of multiple FSTs
|
||||
pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let fsts = values.iter().map(fst::Set::new).collect::<StdResult<Vec<_>, _>>()?;
|
||||
let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect();
|
||||
let op = op_builder.r#union();
|
||||
|
||||
let mut build = fst::SetBuilder::memory();
|
||||
build.extend_stream(op.into_stream()).unwrap();
|
||||
Ok(build.into_inner().unwrap())
|
||||
}
|
||||
|
||||
pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
Ok(values.first().unwrap().to_vec())
|
||||
}
|
||||
|
||||
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let (head, tail) = values.split_first().unwrap();
|
||||
let mut head = RoaringBitmap::deserialize_from(&head[..])?;
|
||||
|
||||
for value in tail {
|
||||
head |= RoaringBitmap::deserialize_from(&value[..])?;
|
||||
}
|
||||
|
||||
let mut vec = Vec::with_capacity(head.serialized_size());
|
||||
head.serialize_into(&mut vec)?;
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
/// Uses the FacetStringLevelZeroValueCodec to merge the values.
|
||||
pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let (head, tail) = values.split_first().unwrap();
|
||||
let (head_string, mut head_rb) =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&head[..])
|
||||
.ok_or(SerializationError::Decoding { db_name: None })?;
|
||||
|
||||
for value in tail {
|
||||
let (_string, rb) =
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&value[..])
|
||||
.ok_or(SerializationError::Decoding { db_name: None })?;
|
||||
head_rb |= rb;
|
||||
}
|
||||
|
||||
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&(head_string, head_rb))
|
||||
.map(|cow| cow.into_owned())
|
||||
.ok_or(SerializationError::Encoding { db_name: None })
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
|
||||
let (head, tail) = values.split_first().unwrap();
|
||||
let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;
|
||||
|
||||
for value in tail {
|
||||
head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?;
|
||||
}
|
||||
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::serialize_into(&head, &mut vec);
|
||||
Ok(vec)
|
||||
}
|
@ -1,240 +1,44 @@
|
||||
use std::borrow::Cow;
|
||||
mod extract;
|
||||
mod helpers;
|
||||
mod transform;
|
||||
mod typed_chunk;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
|
||||
use std::io::{self, BufRead, BufReader};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::result::Result as StdResult;
|
||||
use std::str;
|
||||
use std::sync::mpsc::sync_channel;
|
||||
use std::time::Instant;
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use chrono::Utc;
|
||||
use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
|
||||
use heed::types::ByteSlice;
|
||||
use log::{debug, error, info};
|
||||
use memmap::Mmap;
|
||||
use rayon::prelude::*;
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
use grenad::{self, CompressionType};
|
||||
use log::{debug, info};
|
||||
use rayon::ThreadPool;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
||||
|
||||
pub use self::merge_function::{
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
pub use self::helpers::{
|
||||
create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||
sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, MergeFn,
|
||||
};
|
||||
use self::store::{Readers, Store};
|
||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
use super::UpdateBuilder;
|
||||
use crate::error::{Error, InternalError};
|
||||
use crate::update::{
|
||||
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
|
||||
Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
|
||||
WordsLevelPositions, WordsPrefixesFst,
|
||||
};
|
||||
use crate::{Index, MergeFn, Result};
|
||||
use crate::{Index, Result};
|
||||
|
||||
mod merge_function;
|
||||
mod store;
|
||||
mod transform;
|
||||
static MERGED_DATABASE_COUNT: usize = 7;
|
||||
static PREFIX_DATABASE_COUNT: usize = 5;
|
||||
static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct DocumentAdditionResult {
|
||||
pub nb_documents: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub enum WriteMethod {
|
||||
Append,
|
||||
GetMergePut,
|
||||
}
|
||||
|
||||
pub fn create_writer(
|
||||
typ: CompressionType,
|
||||
level: Option<u32>,
|
||||
file: File,
|
||||
) -> io::Result<Writer<File>> {
|
||||
let mut builder = Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
builder.compression_level(level);
|
||||
}
|
||||
builder.build(file)
|
||||
}
|
||||
|
||||
pub fn create_sorter<E>(
|
||||
merge: MergeFn<E>,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
) -> Sorter<MergeFn<E>> {
|
||||
let mut builder = Sorter::builder(merge);
|
||||
if let Some(shrink_size) = chunk_fusing_shrink_size {
|
||||
builder.file_fusing_shrink_size(shrink_size);
|
||||
}
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
builder.chunk_compression_level(level);
|
||||
}
|
||||
if let Some(nb_chunks) = max_nb_chunks {
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.max_memory(memory);
|
||||
}
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(
|
||||
writer: Writer<File>,
|
||||
shrink_size: Option<u64>,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
let mut file = writer.into_inner()?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
let file = if let Some(shrink_size) = shrink_size {
|
||||
FileFuse::builder().shrink_size(shrink_size).build(file)
|
||||
} else {
|
||||
FileFuse::new(file)
|
||||
};
|
||||
Reader::new(file).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn merge_readers<E>(
|
||||
sources: Vec<Reader<FileFuse>>,
|
||||
merge: MergeFn<E>,
|
||||
) -> Merger<FileFuse, MergeFn<E>> {
|
||||
let mut builder = Merger::builder(merge);
|
||||
builder.extend(sources);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn merge_into_lmdb_database<E>(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
sources: Vec<Reader<FileFuse>>,
|
||||
merge: MergeFn<E>,
|
||||
method: WriteMethod,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
debug!("Merging {} MTBL stores...", sources.len());
|
||||
let before = Instant::now();
|
||||
|
||||
let merger = merge_readers(sources, merge);
|
||||
merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write_into_lmdb_database<E>(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
mut reader: Reader<FileFuse>,
|
||||
merge: MergeFn<E>,
|
||||
method: WriteMethod,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
debug!("Writing MTBL stores...");
|
||||
let before = Instant::now();
|
||||
|
||||
match method {
|
||||
WriteMethod::Append => {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if key == k => {
|
||||
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
|
||||
let val = merge(k, &vals)?;
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(k, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn sorter_into_lmdb_database<E>(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
sorter: Sorter<MergeFn<E>>,
|
||||
merge: MergeFn<E>,
|
||||
method: WriteMethod,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
Error: From<grenad::Error<E>>,
|
||||
{
|
||||
debug!("Writing MTBL sorter...");
|
||||
let before = Instant::now();
|
||||
|
||||
merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn merger_iter_into_lmdb_database<R: io::Read, E>(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
database: heed::PolyDatabase,
|
||||
mut sorter: MergerIter<R, MergeFn<E>>,
|
||||
merge: MergeFn<E>,
|
||||
method: WriteMethod,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
match method {
|
||||
WriteMethod::Append => {
|
||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { out_iter.append(k, v)? };
|
||||
}
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
match iter.next().transpose()? {
|
||||
Some((key, old_val)) if key == k => {
|
||||
let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
|
||||
let val = merge(k, &vals).map_err(|_| {
|
||||
// TODO just wrap this error?
|
||||
InternalError::IndexingMergingKeys { process: "get-put-merge" }
|
||||
})?;
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.put_current(k, &val)? };
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub enum IndexDocumentsMethod {
|
||||
@ -247,6 +51,12 @@ pub enum IndexDocumentsMethod {
|
||||
UpdateDocuments,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub enum WriteMethod {
|
||||
Append,
|
||||
GetMergePut,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[non_exhaustive]
|
||||
pub enum UpdateFormat {
|
||||
@ -262,16 +72,15 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
pub(crate) log_every_n: Option<usize>,
|
||||
pub(crate) documents_chunk_size: Option<usize>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
pub(crate) linked_hash_map_size: Option<usize>,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||
facet_level_group_size: Option<NonZeroUsize>,
|
||||
facet_min_level_size: Option<NonZeroUsize>,
|
||||
words_prefix_threshold: Option<f64>,
|
||||
words_prefix_threshold: Option<u32>,
|
||||
max_prefix_length: Option<usize>,
|
||||
words_positions_level_group_size: Option<NonZeroU32>,
|
||||
words_positions_min_level_size: Option<NonZeroU32>,
|
||||
@ -291,12 +100,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
wtxn,
|
||||
index,
|
||||
log_every_n: None,
|
||||
documents_chunk_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
linked_hash_map_size: None,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
thread_pool: None,
|
||||
facet_level_group_size: None,
|
||||
facet_min_level_size: None,
|
||||
@ -327,6 +135,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
self.autogenerate_docids = false;
|
||||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> Result<DocumentAdditionResult>
|
||||
where
|
||||
R: io::Read,
|
||||
@ -344,14 +153,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
let before_transform = Instant::now();
|
||||
let update_id = self.update_id;
|
||||
let progress_callback = |step| progress_callback(step, update_id);
|
||||
|
||||
let transform = Transform {
|
||||
rtxn: &self.wtxn,
|
||||
index: self.index,
|
||||
log_every_n: self.log_every_n,
|
||||
chunk_compression_type: self.chunk_compression_type,
|
||||
chunk_compression_level: self.chunk_compression_level,
|
||||
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
|
||||
max_nb_chunks: self.max_nb_chunks,
|
||||
max_memory: self.max_memory,
|
||||
index_documents_method: self.update_method,
|
||||
@ -374,12 +181,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
Ok(DocumentAdditionResult { nb_documents })
|
||||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let before_indexing = Instant::now();
|
||||
|
||||
let TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
@ -395,6 +201,78 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
// up to date field map.
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
|
||||
let backup_pool;
|
||||
let pool = match self.thread_pool {
|
||||
Some(pool) => pool,
|
||||
#[cfg(not(test))]
|
||||
None => {
|
||||
// We initialize a bakcup pool with the default
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
|
||||
&backup_pool
|
||||
}
|
||||
#[cfg(test)]
|
||||
None => {
|
||||
// We initialize a bakcup pool with the default
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
|
||||
&backup_pool
|
||||
}
|
||||
};
|
||||
|
||||
let documents_file = grenad::Reader::new(documents_file)?;
|
||||
|
||||
// create LMDB writer channel
|
||||
let (lmdb_writer_sx, lmdb_writer_rx): (
|
||||
Sender<Result<TypedChunk>>,
|
||||
Receiver<Result<TypedChunk>>,
|
||||
) = crossbeam_channel::unbounded();
|
||||
|
||||
// get searchable fields for word databases
|
||||
let searchable_fields =
|
||||
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
|
||||
// get filterable fields for facet databases
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
// let stop_words = stop_words.as_ref();
|
||||
|
||||
// Run extraction pipeline in parallel.
|
||||
pool.install(|| {
|
||||
let params = GrenadParameters {
|
||||
chunk_compression_type: self.chunk_compression_type,
|
||||
chunk_compression_level: self.chunk_compression_level,
|
||||
max_memory: self.max_memory,
|
||||
max_nb_chunks: self.max_nb_chunks, // default value, may be chosen.
|
||||
};
|
||||
|
||||
// split obkv file into several chuncks
|
||||
let chunk_iter = grenad_obkv_into_chunks(
|
||||
documents_file,
|
||||
params.clone(),
|
||||
self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB
|
||||
);
|
||||
|
||||
let result = chunk_iter.map(|chunk_iter| {
|
||||
// extract all databases from the chunked obkv douments
|
||||
extract::data_from_obkv_documents(
|
||||
chunk_iter,
|
||||
params,
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
stop_words,
|
||||
)
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
|
||||
// needs to be droped to avoid channel waiting lock.
|
||||
drop(lmdb_writer_sx)
|
||||
});
|
||||
|
||||
// We delete the documents that this document addition replaces. This way we are
|
||||
// able to simply insert all the documents even if they already exist in the database.
|
||||
if !replaced_documents_ids.is_empty() {
|
||||
@ -402,10 +280,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
log_every_n: self.log_every_n,
|
||||
max_nb_chunks: self.max_nb_chunks,
|
||||
max_memory: self.max_memory,
|
||||
linked_hash_map_size: self.linked_hash_map_size,
|
||||
documents_chunk_size: self.documents_chunk_size,
|
||||
chunk_compression_type: self.chunk_compression_type,
|
||||
chunk_compression_level: self.chunk_compression_level,
|
||||
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
|
||||
thread_pool: self.thread_pool,
|
||||
update_id: self.update_id,
|
||||
};
|
||||
@ -416,189 +293,39 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
debug!("{} documents actually deleted", deleted_documents_count);
|
||||
}
|
||||
|
||||
if documents_count == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||
let index_is_empty = index_documents_ids.len() == 0;
|
||||
let mut final_documents_ids = RoaringBitmap::new();
|
||||
|
||||
let bytes = unsafe { Mmap::map(&documents_file)? };
|
||||
let documents = grenad::Reader::new(bytes.as_bytes()).unwrap();
|
||||
|
||||
// The enum which indicates the type of the readers
|
||||
// merges that are potentially done on different threads.
|
||||
enum DatabaseType {
|
||||
Main,
|
||||
WordDocids,
|
||||
WordLevel0PositionDocids,
|
||||
FieldIdWordCountDocids,
|
||||
FacetLevel0NumbersDocids,
|
||||
}
|
||||
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? {
|
||||
Some(fields) => fields.iter().copied().collect(),
|
||||
None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
|
||||
};
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let stop_words = stop_words.as_ref();
|
||||
let linked_hash_map_size = self.linked_hash_map_size;
|
||||
let max_nb_chunks = self.max_nb_chunks;
|
||||
let max_memory = self.max_memory;
|
||||
let chunk_compression_type = self.chunk_compression_type;
|
||||
let chunk_compression_level = self.chunk_compression_level;
|
||||
let log_every_n = self.log_every_n;
|
||||
let chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
|
||||
let backup_pool;
|
||||
let pool = match self.thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
// We initialize a bakcup pool with the default
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
|
||||
&backup_pool
|
||||
}
|
||||
};
|
||||
|
||||
let readers = pool.install(|| {
|
||||
let num_threads = rayon::current_num_threads();
|
||||
let max_memory_by_job = max_memory.map(|mm| mm / num_threads);
|
||||
|
||||
let readers = rayon::iter::repeatn(documents, num_threads)
|
||||
.enumerate()
|
||||
.map(|(i, documents)| {
|
||||
let store = Store::new(
|
||||
searchable_fields.clone(),
|
||||
faceted_fields.clone(),
|
||||
linked_hash_map_size,
|
||||
max_nb_chunks,
|
||||
max_memory_by_job,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
stop_words,
|
||||
)?;
|
||||
store.index(
|
||||
documents,
|
||||
documents_count,
|
||||
i,
|
||||
num_threads,
|
||||
log_every_n,
|
||||
&progress_callback,
|
||||
)
|
||||
})
|
||||
.collect::<StdResult<Vec<_>, _>>()?;
|
||||
|
||||
let mut main_readers = Vec::with_capacity(readers.len());
|
||||
let mut word_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
|
||||
let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
|
||||
let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
|
||||
let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len());
|
||||
let mut documents_readers = Vec::with_capacity(readers.len());
|
||||
readers.into_iter().for_each(|readers| {
|
||||
let Readers {
|
||||
main,
|
||||
word_docids,
|
||||
docid_word_positions,
|
||||
words_pairs_proximities_docids,
|
||||
word_level_position_docids,
|
||||
field_id_word_count_docids,
|
||||
facet_field_numbers_docids,
|
||||
facet_field_strings_docids,
|
||||
field_id_docid_facet_numbers,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
} = readers;
|
||||
main_readers.push(main);
|
||||
word_docids_readers.push(word_docids);
|
||||
docid_word_positions_readers.push(docid_word_positions);
|
||||
words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
|
||||
word_level_position_docids_readers.push(word_level_position_docids);
|
||||
field_id_word_count_docids_readers.push(field_id_word_count_docids);
|
||||
facet_field_numbers_docids_readers.push(facet_field_numbers_docids);
|
||||
facet_field_strings_docids_readers.push(facet_field_strings_docids);
|
||||
field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers);
|
||||
field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings);
|
||||
documents_readers.push(documents);
|
||||
let mut databases_seen = 0;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
// This is the function that merge the readers
|
||||
// by using the given merge function.
|
||||
let merge_readers = move |readers, merge| {
|
||||
let mut writer = tempfile::tempfile().and_then(|f| {
|
||||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||
})?;
|
||||
let merger = merge_readers(readers, merge);
|
||||
merger.write_into(&mut writer)?;
|
||||
writer_into_reader(writer, chunk_fusing_shrink_size)
|
||||
};
|
||||
|
||||
// The enum and the channel which is used to transfert
|
||||
// the readers merges potentially done on another thread.
|
||||
let (sender, receiver) = sync_channel(2);
|
||||
|
||||
debug!("Merging the main, word docids and words pairs proximity docids in parallel...");
|
||||
rayon::spawn(move || {
|
||||
vec![
|
||||
(DatabaseType::Main, main_readers, fst_merge as MergeFn<_>),
|
||||
(DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge),
|
||||
(
|
||||
DatabaseType::FacetLevel0NumbersDocids,
|
||||
facet_field_numbers_docids_readers,
|
||||
cbo_roaring_bitmap_merge,
|
||||
),
|
||||
(
|
||||
DatabaseType::WordLevel0PositionDocids,
|
||||
word_level_position_docids_readers,
|
||||
cbo_roaring_bitmap_merge,
|
||||
),
|
||||
(
|
||||
DatabaseType::FieldIdWordCountDocids,
|
||||
field_id_word_count_docids_readers,
|
||||
cbo_roaring_bitmap_merge,
|
||||
),
|
||||
]
|
||||
.into_par_iter()
|
||||
.for_each(|(dbtype, readers, merge)| {
|
||||
let result = merge_readers(readers, merge);
|
||||
if let Err(e) = sender.send((dbtype, result)) {
|
||||
error!("sender error: {}", e);
|
||||
for typed_chunk in lmdb_writer_rx {
|
||||
let (docids, is_merged_database) =
|
||||
write_typed_chunk_into_index(typed_chunk?, &self.index, self.wtxn, index_is_empty)?;
|
||||
if !docids.is_empty() {
|
||||
final_documents_ids |= docids;
|
||||
let documents_seen_count = final_documents_ids.len();
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: documents_seen_count as usize,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
debug!(
|
||||
"We have seen {} documents on {} total document so far",
|
||||
documents_seen_count, documents_count
|
||||
);
|
||||
}
|
||||
if is_merged_database {
|
||||
databases_seen += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
});
|
||||
|
||||
Ok((
|
||||
receiver,
|
||||
docid_word_positions_readers,
|
||||
documents_readers,
|
||||
words_pairs_proximities_docids_readers,
|
||||
facet_field_strings_docids_readers,
|
||||
field_id_docid_facet_numbers_readers,
|
||||
field_id_docid_facet_strings_readers,
|
||||
)) as Result<_>
|
||||
})?;
|
||||
|
||||
let (
|
||||
receiver,
|
||||
docid_word_positions_readers,
|
||||
documents_readers,
|
||||
words_pairs_proximities_docids_readers,
|
||||
facet_field_strings_docids_readers,
|
||||
field_id_docid_facet_numbers_readers,
|
||||
field_id_docid_facet_strings_readers,
|
||||
) = readers;
|
||||
|
||||
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||
let contains_documents = !documents_ids.is_empty();
|
||||
let write_method =
|
||||
if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
|
||||
|
||||
debug!("Writing using the write method: {:?}", write_method);
|
||||
}
|
||||
}
|
||||
|
||||
// We write the field distribution into the main database
|
||||
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
|
||||
@ -609,180 +336,24 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
// We write the external documents ids into the main database.
|
||||
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
|
||||
|
||||
// We merge the new documents ids with the existing ones.
|
||||
documents_ids |= new_documents_ids;
|
||||
documents_ids |= replaced_documents_ids;
|
||||
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
||||
let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids;
|
||||
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
|
||||
|
||||
let mut database_count = 0;
|
||||
let total_databases = 11;
|
||||
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: 0,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
debug!("Inserting the docid word positions into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.docid_word_positions.as_polymorph(),
|
||||
docid_word_positions_readers,
|
||||
keep_first,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
debug!("Inserting the documents into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.documents.as_polymorph(),
|
||||
documents_readers,
|
||||
keep_first,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
debug!("Writing the facet id string docids into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.facet_id_string_docids.as_polymorph(),
|
||||
facet_field_strings_docids_readers,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
debug!("Writing the field id docid facet numbers into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.field_id_docid_facet_f64s.as_polymorph(),
|
||||
field_id_docid_facet_numbers_readers,
|
||||
keep_first,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
debug!("Writing the field id docid facet strings into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.field_id_docid_facet_strings.as_polymorph(),
|
||||
field_id_docid_facet_strings_readers,
|
||||
keep_first,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
debug!("Writing the words pairs proximities docids into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_pair_proximity_docids.as_polymorph(),
|
||||
words_pairs_proximities_docids_readers,
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
|
||||
for (db_type, result) in receiver {
|
||||
let content = result?;
|
||||
match db_type {
|
||||
DatabaseType::Main => {
|
||||
debug!("Writing the main elements into LMDB on disk...");
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
self.index.main,
|
||||
content,
|
||||
fst_merge,
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
}
|
||||
DatabaseType::WordDocids => {
|
||||
debug!("Writing the words docids into LMDB on disk...");
|
||||
let db = *self.index.word_docids.as_polymorph();
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
db,
|
||||
content,
|
||||
roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
}
|
||||
DatabaseType::FacetLevel0NumbersDocids => {
|
||||
debug!("Writing the facet numbers docids into LMDB on disk...");
|
||||
let db = *self.index.facet_id_f64_docids.as_polymorph();
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
db,
|
||||
content,
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
}
|
||||
DatabaseType::FieldIdWordCountDocids => {
|
||||
debug!("Writing the field id word count docids into LMDB on disk...");
|
||||
let db = *self.index.field_id_word_count_docids.as_polymorph();
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
db,
|
||||
content,
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
}
|
||||
DatabaseType::WordLevel0PositionDocids => {
|
||||
debug!("Writing the word level 0 positions docids into LMDB on disk...");
|
||||
let db = *self.index.word_level_position_docids.as_polymorph();
|
||||
write_into_lmdb_database(
|
||||
self.wtxn,
|
||||
db,
|
||||
content,
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
}
|
||||
self.execute_prefix_databases(progress_callback)
|
||||
}
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases,
|
||||
});
|
||||
}
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute_prefix_databases<F>(self, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
// Merged databases are already been indexed, we start from this count;
|
||||
let mut databases_seen = MERGED_DATABASE_COUNT;
|
||||
|
||||
// Run the facets update operation.
|
||||
let mut builder = Facets::new(self.wtxn, self.index, self.update_id);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
if let Some(value) = self.facet_level_group_size {
|
||||
builder.level_group_size(value);
|
||||
}
|
||||
@ -791,6 +362,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
}
|
||||
builder.execute()?;
|
||||
|
||||
databases_seen += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
// Run the words prefixes update operation.
|
||||
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id);
|
||||
if let Some(value) = self.words_prefix_threshold {
|
||||
@ -801,29 +378,44 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
}
|
||||
builder.execute()?;
|
||||
|
||||
databases_seen += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
// Run the word prefix docids update operation.
|
||||
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
builder.max_nb_chunks = self.max_nb_chunks;
|
||||
builder.max_memory = self.max_memory;
|
||||
builder.execute()?;
|
||||
|
||||
databases_seen += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
// Run the word prefix pair proximity docids update operation.
|
||||
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
builder.max_nb_chunks = self.max_nb_chunks;
|
||||
builder.max_memory = self.max_memory;
|
||||
builder.execute()?;
|
||||
|
||||
databases_seen += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
// Run the words level positions update operation.
|
||||
let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
if let Some(value) = self.words_positions_level_group_size {
|
||||
builder.level_group_size(value);
|
||||
}
|
||||
@ -832,9 +424,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
}
|
||||
builder.execute()?;
|
||||
|
||||
debug_assert_eq!(database_count, total_databases);
|
||||
|
||||
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
|
||||
databases_seen += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,985 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
use std::iter::FromIterator;
|
||||
use std::time::Instant;
|
||||
use std::{cmp, iter};
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use concat_arrays::concat_arrays;
|
||||
use fst::Set;
|
||||
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
|
||||
use heed::BytesEncode;
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use log::{debug, info, warn};
|
||||
use meilisearch_tokenizer::token::SeparatorKind;
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use tempfile::tempfile;
|
||||
|
||||
use super::merge_function::{
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
};
|
||||
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
|
||||
use crate::error::{Error, InternalError, SerializationError};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
|
||||
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::update::UpdateIndexingStep;
|
||||
use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
|
||||
|
||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||
const ONE_KILOBYTE: usize = 1024 * 1024;
|
||||
|
||||
const MAX_POSITION: usize = 1000;
|
||||
const WORDS_FST_KEY: &[u8] = crate::index::main_key::WORDS_FST_KEY.as_bytes();
|
||||
|
||||
pub struct Readers {
|
||||
pub main: Reader<FileFuse>,
|
||||
pub word_docids: Reader<FileFuse>,
|
||||
pub docid_word_positions: Reader<FileFuse>,
|
||||
pub words_pairs_proximities_docids: Reader<FileFuse>,
|
||||
pub word_level_position_docids: Reader<FileFuse>,
|
||||
pub field_id_word_count_docids: Reader<FileFuse>,
|
||||
pub facet_field_numbers_docids: Reader<FileFuse>,
|
||||
pub facet_field_strings_docids: Reader<FileFuse>,
|
||||
pub field_id_docid_facet_numbers: Reader<FileFuse>,
|
||||
pub field_id_docid_facet_strings: Reader<FileFuse>,
|
||||
pub documents: Reader<FileFuse>,
|
||||
}
|
||||
|
||||
pub struct Store<'s, A> {
|
||||
// Indexing parameters
|
||||
searchable_fields: HashSet<FieldId>,
|
||||
filterable_fields: HashSet<FieldId>,
|
||||
// Caches
|
||||
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
||||
word_docids_limit: usize,
|
||||
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids:
|
||||
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids_limit: usize,
|
||||
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
|
||||
facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>,
|
||||
facet_field_value_docids_limit: usize,
|
||||
// MTBL parameters
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
// MTBL sorters
|
||||
main_sorter: Sorter<MergeFn<Error>>,
|
||||
word_docids_sorter: Sorter<MergeFn<Error>>,
|
||||
words_pairs_proximities_docids_sorter: Sorter<MergeFn<Error>>,
|
||||
word_level_position_docids_sorter: Sorter<MergeFn<Error>>,
|
||||
field_id_word_count_docids_sorter: Sorter<MergeFn<Error>>,
|
||||
facet_field_numbers_docids_sorter: Sorter<MergeFn<Error>>,
|
||||
facet_field_strings_docids_sorter: Sorter<MergeFn<Error>>,
|
||||
field_id_docid_facet_numbers_sorter: Sorter<MergeFn<Error>>,
|
||||
field_id_docid_facet_strings_sorter: Sorter<MergeFn<Error>>,
|
||||
// MTBL writers
|
||||
docid_word_positions_writer: Writer<File>,
|
||||
documents_writer: Writer<File>,
|
||||
// tokenizer
|
||||
analyzer: Analyzer<'s, A>,
|
||||
}
|
||||
|
||||
impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
pub fn new(
|
||||
searchable_fields: HashSet<FieldId>,
|
||||
filterable_fields: HashSet<FieldId>,
|
||||
linked_hash_map_size: Option<usize>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
stop_words: Option<&'s Set<A>>,
|
||||
) -> Result<Self> {
|
||||
// We divide the max memory by the number of sorter the Store have.
|
||||
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
|
||||
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
|
||||
|
||||
let main_sorter = create_sorter(
|
||||
fst_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let word_docids_sorter = create_sorter(
|
||||
roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let words_pairs_proximities_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let word_level_position_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let field_id_word_count_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let facet_field_numbers_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let facet_field_strings_docids_sorter = create_sorter(
|
||||
tuple_string_cbo_roaring_bitmap_merge,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
let field_id_docid_facet_numbers_sorter = create_sorter(
|
||||
keep_first,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
Some(1024 * 1024 * 1024), // 1MB
|
||||
);
|
||||
let field_id_docid_facet_strings_sorter = create_sorter(
|
||||
keep_first,
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
max_nb_chunks,
|
||||
Some(1024 * 1024 * 1024), // 1MB
|
||||
);
|
||||
|
||||
let documents_writer = tempfile()
|
||||
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
|
||||
let docid_word_positions_writer = tempfile()
|
||||
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
|
||||
|
||||
let mut config = AnalyzerConfig::default();
|
||||
if let Some(stop_words) = stop_words {
|
||||
config.stop_words(stop_words);
|
||||
}
|
||||
let analyzer = Analyzer::new(config);
|
||||
|
||||
Ok(Store {
|
||||
// Indexing parameters.
|
||||
searchable_fields,
|
||||
filterable_fields,
|
||||
// Caches
|
||||
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||
field_id_word_count_docids: HashMap::new(),
|
||||
word_docids_limit: linked_hash_map_size,
|
||||
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||
words_pairs_proximities_docids_limit: linked_hash_map_size,
|
||||
facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||
facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
|
||||
facet_field_value_docids_limit: linked_hash_map_size,
|
||||
// MTBL parameters
|
||||
chunk_compression_type,
|
||||
chunk_compression_level,
|
||||
chunk_fusing_shrink_size,
|
||||
// MTBL sorters
|
||||
main_sorter,
|
||||
word_docids_sorter,
|
||||
words_pairs_proximities_docids_sorter,
|
||||
word_level_position_docids_sorter,
|
||||
field_id_word_count_docids_sorter,
|
||||
facet_field_numbers_docids_sorter,
|
||||
facet_field_strings_docids_sorter,
|
||||
field_id_docid_facet_numbers_sorter,
|
||||
field_id_docid_facet_strings_sorter,
|
||||
// MTBL writers
|
||||
docid_word_positions_writer,
|
||||
documents_writer,
|
||||
// tokenizer
|
||||
analyzer,
|
||||
})
|
||||
}
|
||||
|
||||
// Save the documents ids under the position and word we have seen it.
|
||||
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.word_docids.get_refresh(word.as_bytes()) {
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
let word_vec = SmallVec32::from(word.as_bytes());
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id)));
|
||||
// If the word docids just reached it's capacity we must make sure to remove
|
||||
// one element, this way next time we insert we doesn't grow the capacity.
|
||||
if self.word_docids.len() == self.word_docids_limit {
|
||||
// Removing the front element is equivalent to removing the LRU element.
|
||||
let lru = self.word_docids.pop_front();
|
||||
Self::write_word_docids(&mut self.word_docids_sorter, lru)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_facet_number_values_docid(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
value: OrderedFloat<f64>,
|
||||
id: DocumentId,
|
||||
) -> Result<()> {
|
||||
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
|
||||
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
|
||||
|
||||
let key = (field_id, value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_number_docids.get_refresh(&key) {
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
|
||||
// If the word docids just reached it's capacity we must make sure to remove
|
||||
// one element, this way next time we insert we doesn't grow the capacity.
|
||||
if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit {
|
||||
// Removing the front element is equivalent to removing the LRU element.
|
||||
Self::write_facet_field_number_docids(
|
||||
&mut self.facet_field_numbers_docids_sorter,
|
||||
self.facet_field_number_docids.pop_front(),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Save the documents ids under the facet field id and value we have seen it.
|
||||
fn insert_facet_string_values_docid(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
normalized_value: String,
|
||||
original_value: String,
|
||||
id: DocumentId,
|
||||
) -> Result<()> {
|
||||
if normalized_value.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let sorter = &mut self.field_id_docid_facet_strings_sorter;
|
||||
Self::write_field_id_docid_facet_string_value(
|
||||
sorter,
|
||||
field_id,
|
||||
id,
|
||||
&normalized_value,
|
||||
&original_value,
|
||||
)?;
|
||||
|
||||
let key = (field_id, normalized_value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_string_docids.get_refresh(&key) {
|
||||
Some((_original_value, old)) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
self.facet_field_string_docids
|
||||
.insert(key, (original_value, RoaringBitmap::from_iter(Some(id))));
|
||||
// If the word docids just reached it's capacity we must make sure to remove
|
||||
// one element, this way next time we insert we doesn't grow the capacity.
|
||||
if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit {
|
||||
// Removing the front element is equivalent to removing the LRU element.
|
||||
Self::write_facet_field_string_docids(
|
||||
&mut self.facet_field_strings_docids_sorter,
|
||||
self.facet_field_string_docids.pop_front(),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Save the documents ids under the words pairs proximities that it contains.
|
||||
fn insert_words_pairs_proximities_docids<'a>(
|
||||
&mut self,
|
||||
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
|
||||
id: DocumentId,
|
||||
) -> Result<()> {
|
||||
for ((w1, w2), prox) in words_pairs_proximities {
|
||||
let w1 = SmallVec32::from(w1.as_bytes());
|
||||
let w2 = SmallVec32::from(w2.as_bytes());
|
||||
let key = (w1, w2, prox);
|
||||
// if get_refresh finds the element it is assured
|
||||
// to be at the end of the linked hash map.
|
||||
match self.words_pairs_proximities_docids.get_refresh(&key) {
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
let ids = RoaringBitmap::from_iter(Some(id));
|
||||
self.words_pairs_proximities_docids.insert(key, ids);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the linked hashmap is over capacity we must remove the overflowing elements.
|
||||
let len = self.words_pairs_proximities_docids.len();
|
||||
let overflow = len.checked_sub(self.words_pairs_proximities_docids_limit);
|
||||
if let Some(overflow) = overflow {
|
||||
let mut lrus = Vec::with_capacity(overflow);
|
||||
// Removing front elements is equivalent to removing the LRUs.
|
||||
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
|
||||
iter.take(overflow).for_each(|x| lrus.push(x));
|
||||
Self::write_words_pairs_proximities(
|
||||
&mut self.words_pairs_proximities_docids_sorter,
|
||||
lrus,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_document(
|
||||
&mut self,
|
||||
document_id: DocumentId,
|
||||
words_positions: &mut HashMap<String, SmallVec32<Position>>,
|
||||
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
|
||||
facet_strings_values: &mut HashMap<FieldId, Vec<(String, String)>>,
|
||||
record: &[u8],
|
||||
) -> Result<()> {
|
||||
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
|
||||
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
|
||||
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
|
||||
|
||||
// We store document_id associated with all the words the record contains.
|
||||
for (word, _) in words_positions.iter() {
|
||||
self.insert_word_docid(word, document_id)?;
|
||||
}
|
||||
|
||||
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
||||
Self::write_docid_word_positions(
|
||||
&mut self.docid_word_positions_writer,
|
||||
document_id,
|
||||
words_positions,
|
||||
)?;
|
||||
Self::write_word_position_docids(
|
||||
&mut self.word_level_position_docids_sorter,
|
||||
document_id,
|
||||
words_positions,
|
||||
)?;
|
||||
|
||||
words_positions.clear();
|
||||
|
||||
// We store document_id associated with all the facet numbers fields ids and values.
|
||||
for (field, values) in facet_numbers_values.drain() {
|
||||
for value in values {
|
||||
let value = OrderedFloat::from(value);
|
||||
self.insert_facet_number_values_docid(field, value, document_id)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We store document_id associated with all the facet strings fields ids and values.
|
||||
for (field, values) in facet_strings_values.drain() {
|
||||
for (normalized, original) in values {
|
||||
self.insert_facet_string_values_docid(field, normalized, original, document_id)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_words_pairs_proximities<E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key = Vec::new();
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
for ((w1, w2, min_prox), docids) in iter {
|
||||
key.clear();
|
||||
key.extend_from_slice(w1.as_bytes());
|
||||
key.push(0);
|
||||
key.extend_from_slice(w2.as_bytes());
|
||||
// Storing the minimun proximity found between those words
|
||||
key.push(min_prox);
|
||||
// We serialize the document ids into a buffer
|
||||
buffer.clear();
|
||||
buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids));
|
||||
CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer);
|
||||
// that we write under the generated key into MTBL
|
||||
if lmdb_key_valid_size(&key) {
|
||||
sorter.insert(&key, &buffer)?;
|
||||
} else {
|
||||
warn!(
|
||||
"words pairs proximity ({:?} - {:?}, {:?}) is too large to be saved",
|
||||
w1, w2, min_prox
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_docid_word_positions(
|
||||
writer: &mut Writer<File>,
|
||||
id: DocumentId,
|
||||
words_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> Result<()> {
|
||||
// We prefix the words by the document id.
|
||||
let mut key = id.to_be_bytes().to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
let base_size = key.len();
|
||||
|
||||
// We order the words lexicographically, this way we avoid passing by a sorter.
|
||||
let words_positions = BTreeMap::from_iter(words_positions);
|
||||
|
||||
for (word, positions) in words_positions {
|
||||
key.truncate(base_size);
|
||||
key.extend_from_slice(word.as_bytes());
|
||||
buffer.clear();
|
||||
|
||||
// We serialize the positions into a buffer.
|
||||
let positions = RoaringBitmap::from_iter(positions.iter().cloned());
|
||||
BoRoaringBitmapCodec::serialize_into(&positions, &mut buffer);
|
||||
|
||||
// that we write under the generated key into MTBL
|
||||
if lmdb_key_valid_size(&key) {
|
||||
writer.insert(&key, &buffer)?;
|
||||
} else {
|
||||
warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_word_position_docids<E>(
|
||||
writer: &mut Sorter<MergeFn<E>>,
|
||||
document_id: DocumentId,
|
||||
words_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut data_buffer = Vec::new();
|
||||
|
||||
for (word, positions) in words_positions {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word.as_bytes());
|
||||
key_buffer.push(0); // level 0
|
||||
|
||||
for position in positions {
|
||||
key_buffer.truncate(word.len() + 1);
|
||||
let position_bytes = position.to_be_bytes();
|
||||
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
||||
key_buffer.extend_from_slice(position_bytes.as_bytes());
|
||||
|
||||
data_buffer.clear();
|
||||
let positions = RoaringBitmap::from_iter(Some(document_id));
|
||||
// We serialize the positions into a buffer.
|
||||
CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer);
|
||||
|
||||
// that we write under the generated key into MTBL
|
||||
if lmdb_key_valid_size(&key_buffer) {
|
||||
writer.insert(&key_buffer, &data_buffer)?;
|
||||
} else {
|
||||
warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = ((FieldId, String), (String, RoaringBitmap))>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key_buffer = Vec::new();
|
||||
|
||||
for ((field_id, normalized_value), (original_value, docids)) in iter {
|
||||
key_buffer.clear();
|
||||
|
||||
FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer);
|
||||
|
||||
let data = (original_value.as_str(), docids);
|
||||
let data = FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data)
|
||||
.ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?;
|
||||
|
||||
if lmdb_key_valid_size(&key_buffer) {
|
||||
sorter.insert(&key_buffer, &data)?;
|
||||
} else {
|
||||
warn!(
|
||||
"facet value {:?} is too large to be saved",
|
||||
original_value.as_bytes().as_bstr()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut data_buffer = Vec::new();
|
||||
|
||||
for ((field_id, value), docids) in iter {
|
||||
data_buffer.clear();
|
||||
|
||||
let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value))
|
||||
.map(Cow::into_owned)
|
||||
.ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?;
|
||||
|
||||
CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer);
|
||||
|
||||
if lmdb_key_valid_size(&key) {
|
||||
sorter.insert(&key, &data_buffer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_field_id_docid_facet_number_value<E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
field_id: FieldId,
|
||||
document_id: DocumentId,
|
||||
value: OrderedFloat<f64>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value))
|
||||
.map(Cow::into_owned)
|
||||
.ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?;
|
||||
|
||||
if lmdb_key_valid_size(&key) {
|
||||
sorter.insert(&key, &[])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_field_id_docid_facet_string_value<E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
field_id: FieldId,
|
||||
document_id: DocumentId,
|
||||
normalized_value: &str,
|
||||
original_value: &str,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
FieldDocIdFacetStringCodec::serialize_into(
|
||||
field_id,
|
||||
document_id,
|
||||
normalized_value,
|
||||
&mut buffer,
|
||||
);
|
||||
|
||||
if lmdb_key_valid_size(&buffer) {
|
||||
sorter.insert(&buffer, original_value.as_bytes())?;
|
||||
} else {
|
||||
warn!("facet value {:?} is too large to be saved", original_value.as_bytes().as_bstr());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key = Vec::new();
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
for (word, ids) in iter {
|
||||
key.clear();
|
||||
key.extend_from_slice(&word);
|
||||
// We serialize the document ids into a buffer
|
||||
buffer.clear();
|
||||
let ids = RoaringBitmap::from_iter(ids);
|
||||
buffer.reserve(ids.serialized_size());
|
||||
ids.serialize_into(&mut buffer)?;
|
||||
// that we write under the generated key into MTBL
|
||||
if lmdb_key_valid_size(&key) {
|
||||
sorter.insert(&key, &buffer)?;
|
||||
} else {
|
||||
warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn index<F>(
|
||||
mut self,
|
||||
mut documents: grenad::Reader<&[u8]>,
|
||||
documents_count: usize,
|
||||
thread_index: usize,
|
||||
num_threads: usize,
|
||||
log_every_n: Option<usize>,
|
||||
mut progress_callback: F,
|
||||
) -> Result<Readers>
|
||||
where
|
||||
F: FnMut(UpdateIndexingStep),
|
||||
{
|
||||
debug!("{:?}: Indexing in a Store...", thread_index);
|
||||
|
||||
let mut before = Instant::now();
|
||||
let mut words_positions = HashMap::new();
|
||||
let mut facet_numbers_values = HashMap::new();
|
||||
let mut facet_strings_values = HashMap::new();
|
||||
|
||||
let mut count: usize = 0;
|
||||
while let Some((key, value)) = documents.next()? {
|
||||
let document_id = key.try_into().map(u32::from_be_bytes).unwrap();
|
||||
let document = obkv::KvReader::new(value);
|
||||
|
||||
// We skip documents that must not be indexed by this thread.
|
||||
if count % num_threads == thread_index {
|
||||
// This is a log routine that we do every `log_every_n` documents.
|
||||
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
|
||||
info!(
|
||||
"We have seen {} documents so far ({:.02?}).",
|
||||
format_count(count),
|
||||
before.elapsed()
|
||||
);
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: count,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
before = Instant::now();
|
||||
}
|
||||
|
||||
for (attr, content) in document.iter() {
|
||||
if self.filterable_fields.contains(&attr)
|
||||
|| self.searchable_fields.contains(&attr)
|
||||
{
|
||||
let value =
|
||||
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
if self.filterable_fields.contains(&attr) {
|
||||
let (facet_numbers, facet_strings) = extract_facet_values(&value);
|
||||
facet_numbers_values
|
||||
.entry(attr)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(facet_numbers);
|
||||
facet_strings_values
|
||||
.entry(attr)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(facet_strings);
|
||||
}
|
||||
|
||||
if self.searchable_fields.contains(&attr) {
|
||||
let content = match json_to_string(&value) {
|
||||
Some(content) => content,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let analyzed = self.analyzer.analyze(&content);
|
||||
let tokens = process_tokens(analyzed.tokens());
|
||||
|
||||
let mut last_pos = None;
|
||||
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
|
||||
last_pos = Some(pos);
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions
|
||||
.entry(token.text().to_string())
|
||||
.or_insert_with(SmallVec32::new)
|
||||
.push(position);
|
||||
}
|
||||
|
||||
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
|
||||
let key = (attr, last_pos as u8 + 1);
|
||||
self.field_id_word_count_docids
|
||||
.entry(key)
|
||||
.or_insert_with(RoaringBitmap::new)
|
||||
.insert(document_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We write the document in the documents store.
|
||||
self.write_document(
|
||||
document_id,
|
||||
&mut words_positions,
|
||||
&mut facet_numbers_values,
|
||||
&mut facet_strings_values,
|
||||
value,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Compute the document id of the next document.
|
||||
count += 1;
|
||||
}
|
||||
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: count,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
|
||||
let readers = self.finish()?;
|
||||
debug!("{:?}: Store created!", thread_index);
|
||||
Ok(readers)
|
||||
}
|
||||
|
||||
fn finish(mut self) -> Result<Readers> {
|
||||
let comp_type = self.chunk_compression_type;
|
||||
let comp_level = self.chunk_compression_level;
|
||||
let shrink_size = self.chunk_fusing_shrink_size;
|
||||
|
||||
Self::write_word_docids(&mut self.word_docids_sorter, self.word_docids)?;
|
||||
Self::write_words_pairs_proximities(
|
||||
&mut self.words_pairs_proximities_docids_sorter,
|
||||
self.words_pairs_proximities_docids,
|
||||
)?;
|
||||
Self::write_facet_field_number_docids(
|
||||
&mut self.facet_field_numbers_docids_sorter,
|
||||
self.facet_field_number_docids,
|
||||
)?;
|
||||
|
||||
Self::write_facet_field_string_docids(
|
||||
&mut self.facet_field_strings_docids_sorter,
|
||||
self.facet_field_string_docids,
|
||||
)?;
|
||||
|
||||
let mut word_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
let mut iter = self.word_docids_sorter.into_iter()?;
|
||||
while let Some((word, val)) = iter.next()? {
|
||||
// This is a lexicographically ordered word position
|
||||
// we use the key to construct the words fst.
|
||||
builder.insert(word)?;
|
||||
word_docids_wtr.insert(word, val)?;
|
||||
}
|
||||
|
||||
let mut docids_buffer = Vec::new();
|
||||
for ((fid, count), docids) in self.field_id_word_count_docids {
|
||||
docids_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer);
|
||||
let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]);
|
||||
self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?;
|
||||
}
|
||||
|
||||
let fst = builder.into_set();
|
||||
self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?;
|
||||
|
||||
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.main_sorter.write_into(&mut main_wtr)?;
|
||||
|
||||
let mut words_pairs_proximities_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.words_pairs_proximities_docids_sorter
|
||||
.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||
|
||||
let mut word_level_position_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
|
||||
|
||||
let mut field_id_word_count_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
|
||||
|
||||
let mut facet_field_numbers_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
|
||||
|
||||
let mut facet_field_strings_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
|
||||
|
||||
let mut field_id_docid_facet_numbers_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_numbers_sorter
|
||||
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
|
||||
|
||||
let mut field_id_docid_facet_strings_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_strings_sorter
|
||||
.write_into(&mut field_id_docid_facet_strings_wtr)?;
|
||||
|
||||
let main = writer_into_reader(main_wtr, shrink_size)?;
|
||||
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
||||
let words_pairs_proximities_docids =
|
||||
writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||
let word_level_position_docids =
|
||||
writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||
let field_id_word_count_docids =
|
||||
writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
|
||||
let facet_field_numbers_docids =
|
||||
writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
||||
let facet_field_strings_docids =
|
||||
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_numbers =
|
||||
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_strings =
|
||||
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
|
||||
let docid_word_positions =
|
||||
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
|
||||
|
||||
Ok(Readers {
|
||||
main,
|
||||
word_docids,
|
||||
docid_word_positions,
|
||||
words_pairs_proximities_docids,
|
||||
word_level_position_docids,
|
||||
field_id_word_count_docids,
|
||||
facet_field_numbers_docids,
|
||||
facet_field_strings_docids,
|
||||
field_id_docid_facet_numbers,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Outputs a list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
///
|
||||
/// This list is used by the engine to calculate the documents containing words that are
|
||||
/// close to each other.
|
||||
fn compute_words_pair_proximities(
|
||||
word_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> HashMap<(&str, &str), u8> {
|
||||
use itertools::Itertools;
|
||||
|
||||
let mut words_pair_proximities = HashMap::new();
|
||||
for ((w1, ps1), (w2, ps2)) in word_positions.iter().cartesian_product(word_positions) {
|
||||
let mut min_prox = None;
|
||||
for (ps1, ps2) in ps1.iter().cartesian_product(ps2) {
|
||||
let prox = crate::proximity::positions_proximity(*ps1, *ps2);
|
||||
let prox = u8::try_from(prox).unwrap();
|
||||
// We don't care about a word that appear at the
|
||||
// same position or too far from the other.
|
||||
if prox >= 1 && prox <= 7 && min_prox.map_or(true, |mp| prox < mp) {
|
||||
min_prox = Some(prox)
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(min_prox) = min_prox {
|
||||
words_pair_proximities.insert((w1.as_str(), w2.as_str()), min_prox);
|
||||
}
|
||||
}
|
||||
|
||||
words_pair_proximities
|
||||
}
|
||||
|
||||
fn format_count(n: usize) -> String {
|
||||
human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64)
|
||||
}
|
||||
|
||||
fn lmdb_key_valid_size(key: &[u8]) -> bool {
|
||||
!key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH
|
||||
}
|
||||
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standart proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator().is_some())
|
||||
.scan((0, None), |(offset, prev_kind), token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(original) => {
|
||||
let normalized = original.trim().to_lowercase();
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(value, false, output_numbers, output_strings);
|
||||
}
|
||||
}
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
||||
let mut facet_number_values = Vec::new();
|
||||
let mut facet_string_values = Vec::new();
|
||||
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
|
||||
|
||||
(facet_number_values, facet_string_values)
|
||||
}
|
@ -11,15 +11,14 @@ use log::info;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use super::merge_function::merge_two_obkvs;
|
||||
use super::{create_sorter, create_writer, IndexDocumentsMethod};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, MergeFn, Result, BEU32,
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn,
|
||||
};
|
||||
use super::IndexDocumentsMethod;
|
||||
use crate::error::{InternalError, UserError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32};
|
||||
|
||||
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
||||
|
||||
@ -46,7 +45,6 @@ pub struct Transform<'t, 'i> {
|
||||
pub log_every_n: Option<usize>,
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub chunk_fusing_shrink_size: Option<u64>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub index_documents_method: IndexDocumentsMethod,
|
||||
@ -149,7 +147,6 @@ impl Transform<'_, '_> {
|
||||
merge_function,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
@ -169,7 +166,7 @@ impl Transform<'_, '_> {
|
||||
}
|
||||
|
||||
obkv_buffer.clear();
|
||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
// We prepare the fields ids map with the documents keys.
|
||||
for (key, _value) in &document {
|
||||
@ -209,7 +206,6 @@ impl Transform<'_, '_> {
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(field_id, &json_buffer)?;
|
||||
}
|
||||
|
||||
// We validate the document id [a-zA-Z0-9\-_].
|
||||
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
@ -291,7 +287,6 @@ impl Transform<'_, '_> {
|
||||
keep_latest_obkv,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
@ -306,7 +301,7 @@ impl Transform<'_, '_> {
|
||||
let mut record = csv::StringRecord::new();
|
||||
while csv.read_record(&mut record).map_err(UserError::Csv)? {
|
||||
obkv_buffer.clear();
|
||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
@ -372,9 +367,9 @@ impl Transform<'_, '_> {
|
||||
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
||||
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
||||
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
||||
fn output_from_sorter<F, E>(
|
||||
fn output_from_sorter<F>(
|
||||
self,
|
||||
sorter: grenad::Sorter<MergeFn<E>>,
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
primary_key: String,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
approximate_number_of_documents: usize,
|
||||
@ -383,7 +378,6 @@ impl Transform<'_, '_> {
|
||||
) -> Result<TransformOutput>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
Error: From<E>,
|
||||
{
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
let mut field_distribution = self.index.field_distribution(self.rtxn)?;
|
||||
@ -391,10 +385,15 @@ impl Transform<'_, '_> {
|
||||
|
||||
// Once we have sort and deduplicated the documents we write them into a final file.
|
||||
let mut final_sorter = create_sorter(
|
||||
|_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }),
|
||||
|_id, obkvs| {
|
||||
if obkvs.len() == 1 {
|
||||
Ok(obkvs[0].clone())
|
||||
} else {
|
||||
Err(InternalError::IndexingMergingKeys { process: "documents" }.into())
|
||||
}
|
||||
},
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
@ -405,7 +404,7 @@ impl Transform<'_, '_> {
|
||||
|
||||
// While we write into final file we get or generate the internal documents ids.
|
||||
let mut documents_count = 0;
|
||||
let mut iter = sorter.into_iter()?;
|
||||
let mut iter = sorter.into_merger_iter()?;
|
||||
while let Some((external_id, update_obkv)) = iter.next()? {
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
|
||||
@ -534,7 +533,7 @@ impl Transform<'_, '_> {
|
||||
let docid = docid.get();
|
||||
|
||||
obkv_buffer.clear();
|
||||
let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
||||
for (id, name) in new_fields_ids_map.iter() {
|
||||
|
282
milli/src/update/index_documents/typed_chunk.rs
Normal file
282
milli/src/update/index_documents/typed_chunk.rs
Normal file
@ -0,0 +1,282 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
||||
};
|
||||
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
||||
use crate::update::index_documents::helpers::into_clonable_grenad;
|
||||
use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||
Documents(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdWordcountDocids(grenad::Reader<File>),
|
||||
NewDocumentsIds(RoaringBitmap),
|
||||
WordDocids(grenad::Reader<File>),
|
||||
WordLevelPositionDocids(grenad::Reader<File>),
|
||||
WordPairProximityDocids(grenad::Reader<File>),
|
||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||
}
|
||||
|
||||
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
||||
/// Return new documents seen.
|
||||
pub(crate) fn write_typed_chunk_into_index(
|
||||
typed_chunk: TypedChunk,
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunk {
|
||||
TypedChunk::DocidWordPositions(docid_word_positions_iter) => {
|
||||
write_entries_into_database(
|
||||
docid_word_positions_iter,
|
||||
&index.docid_word_positions,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, buffer| {
|
||||
// ensure that values are unique and ordered
|
||||
let positions = roaring_bitmap_from_u32s_array(value);
|
||||
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
|
||||
Ok(buffer)
|
||||
},
|
||||
|new_values, db_values, buffer| {
|
||||
let new_values = roaring_bitmap_from_u32s_array(new_values);
|
||||
let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) {
|
||||
Some(db_values) => new_values | db_values,
|
||||
None => new_values, // should not happen
|
||||
};
|
||||
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
}
|
||||
TypedChunk::Documents(mut obkv_documents_iter) => {
|
||||
while let Some((key, value)) = obkv_documents_iter.next()? {
|
||||
index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
fid_word_count_docids_iter,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::NewDocumentsIds(documents_ids) => {
|
||||
return Ok((documents_ids, is_merged_database))
|
||||
}
|
||||
TypedChunk::WordDocids(word_docids_iter) => {
|
||||
let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?;
|
||||
append_entries_into_database(
|
||||
word_docids_iter.clone(),
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_roaring_bitmaps,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
while let Some((word, _value)) = word_docids_iter.next()? {
|
||||
// This is a lexicographically ordered word position
|
||||
// we use the key to construct the words fst.
|
||||
builder.insert(word)?;
|
||||
}
|
||||
let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
|
||||
let db_fst = index.words_fst(wtxn)?;
|
||||
|
||||
// merge new fst with database fst
|
||||
let union_stream = fst.op().add(db_fst.stream()).union();
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(union_stream)?;
|
||||
let fst = builder.into_set();
|
||||
index.put_words_fst(wtxn, &fst)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
word_level_position_docids_iter,
|
||||
&index.word_level_position_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
facet_id_f64_docids_iter,
|
||||
&index.facet_id_f64_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
word_pair_proximity_docids_iter,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => {
|
||||
let index_fid_docid_facet_numbers =
|
||||
index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>();
|
||||
while let Some((key, value)) = fid_docid_facet_number.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
index_fid_docid_facet_numbers.put(wtxn, key, &value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetStrings(mut fid_docid_facet_string) => {
|
||||
let index_fid_docid_facet_strings =
|
||||
index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>();
|
||||
while let Some((key, value)) = fid_docid_facet_string.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
index_fid_docid_facet_strings.put(wtxn, key, &value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
|
||||
append_entries_into_database(
|
||||
facet_id_string_docids,
|
||||
&index.facet_id_string_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
|value, _buffer| Ok(value),
|
||||
|new_values, db_values, buffer| {
|
||||
let (_, new_values) = decode_prefix_string(new_values).unwrap();
|
||||
let new_values = RoaringBitmap::deserialize_from(new_values)?;
|
||||
let (db_original, db_values) = decode_prefix_string(db_values).unwrap();
|
||||
let db_values = RoaringBitmap::deserialize_from(db_values)?;
|
||||
let values = new_values | db_values;
|
||||
encode_prefix_string(db_original, buffer)?;
|
||||
Ok(values.serialize_into(buffer)?)
|
||||
},
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((RoaringBitmap::new(), is_merged_database))
|
||||
}
|
||||
|
||||
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
||||
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
||||
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
||||
let value = new_value | db_value;
|
||||
Ok(serialize_roaring_bitmap(&value, buffer)?)
|
||||
}
|
||||
|
||||
fn merge_cbo_roaring_bitmaps(
|
||||
new_value: &[u8],
|
||||
db_value: &[u8],
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
Ok(CboRoaringBitmapCodec::merge_into(
|
||||
&[Cow::Borrowed(db_value), Cow::Borrowed(new_value)],
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
mut data: grenad::Reader<R>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: std::io::Read,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<ByteSlice, ByteSlice>();
|
||||
|
||||
while let Some((key, value)) = data.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
serialize_value(value, &mut buffer)?
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
merge_values(value, prev_value, &mut buffer)?;
|
||||
&buffer[..]
|
||||
}
|
||||
None => serialize_value(value, &mut buffer)?,
|
||||
}
|
||||
};
|
||||
database.put(wtxn, key, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
/// All provided entries must be ordered.
|
||||
/// If the index is not empty, write_entries_into_database is called instead.
|
||||
fn append_entries_into_database<R, K, V, FS, FM>(
|
||||
mut data: grenad::Reader<R>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: std::io::Read,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||
{
|
||||
if !index_is_empty {
|
||||
return write_entries_into_database(
|
||||
data,
|
||||
database,
|
||||
wtxn,
|
||||
false,
|
||||
serialize_value,
|
||||
merge_values,
|
||||
);
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
||||
|
||||
while let Some((key, value)) = data.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = serialize_value(value, &mut buffer)?;
|
||||
unsafe { database.append(key, value)? };
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
@ -65,10 +65,9 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
||||
pub(crate) log_every_n: Option<usize>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
pub(crate) linked_hash_map_size: Option<usize>,
|
||||
pub(crate) documents_chunk_size: Option<usize>,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||
update_id: u64,
|
||||
|
||||
@ -95,10 +94,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
log_every_n: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
linked_hash_map_size: None,
|
||||
documents_chunk_size: None,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
thread_pool: None,
|
||||
searchable_fields: Setting::NotSet,
|
||||
displayed_fields: Setting::NotSet,
|
||||
@ -205,7 +203,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
log_every_n: self.log_every_n,
|
||||
chunk_compression_type: self.chunk_compression_type,
|
||||
chunk_compression_level: self.chunk_compression_level,
|
||||
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
|
||||
max_nb_chunks: self.max_nb_chunks,
|
||||
max_memory: self.max_memory,
|
||||
index_documents_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
@ -232,10 +229,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
indexing_builder.log_every_n = self.log_every_n;
|
||||
indexing_builder.max_nb_chunks = self.max_nb_chunks;
|
||||
indexing_builder.max_memory = self.max_memory;
|
||||
indexing_builder.linked_hash_map_size = self.linked_hash_map_size;
|
||||
indexing_builder.documents_chunk_size = self.documents_chunk_size;
|
||||
indexing_builder.chunk_compression_type = self.chunk_compression_type;
|
||||
indexing_builder.chunk_compression_level = self.chunk_compression_level;
|
||||
indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
indexing_builder.thread_pool = self.thread_pool;
|
||||
indexing_builder.execute_raw(output, &cb)?;
|
||||
|
||||
|
@ -7,11 +7,10 @@ use crate::{Index, Result};
|
||||
pub struct UpdateBuilder<'a> {
|
||||
pub(crate) log_every_n: Option<usize>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) documents_chunk_size: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
pub(crate) linked_hash_map_size: Option<usize>,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||
pub(crate) update_id: u64,
|
||||
}
|
||||
@ -21,11 +20,10 @@ impl<'a> UpdateBuilder<'a> {
|
||||
UpdateBuilder {
|
||||
log_every_n: None,
|
||||
max_nb_chunks: None,
|
||||
documents_chunk_size: None,
|
||||
max_memory: None,
|
||||
linked_hash_map_size: None,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
thread_pool: None,
|
||||
update_id,
|
||||
}
|
||||
@ -43,8 +41,8 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self.max_memory = Some(max_memory);
|
||||
}
|
||||
|
||||
pub fn linked_hash_map_size(&mut self, linked_hash_map_size: usize) {
|
||||
self.linked_hash_map_size = Some(linked_hash_map_size);
|
||||
pub fn documents_chunk_size(&mut self, documents_chunk_size: usize) {
|
||||
self.documents_chunk_size = Some(documents_chunk_size);
|
||||
}
|
||||
|
||||
pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) {
|
||||
@ -55,10 +53,6 @@ impl<'a> UpdateBuilder<'a> {
|
||||
self.chunk_compression_level = Some(chunk_compression_level);
|
||||
}
|
||||
|
||||
pub fn chunk_fusing_shrink_size(&mut self, chunk_fusing_shrink_size: u64) {
|
||||
self.chunk_fusing_shrink_size = Some(chunk_fusing_shrink_size);
|
||||
}
|
||||
|
||||
pub fn thread_pool(&mut self, thread_pool: &'a ThreadPool) {
|
||||
self.thread_pool = Some(thread_pool);
|
||||
}
|
||||
@ -89,10 +83,9 @@ impl<'a> UpdateBuilder<'a> {
|
||||
builder.log_every_n = self.log_every_n;
|
||||
builder.max_nb_chunks = self.max_nb_chunks;
|
||||
builder.max_memory = self.max_memory;
|
||||
builder.linked_hash_map_size = self.linked_hash_map_size;
|
||||
builder.documents_chunk_size = self.documents_chunk_size;
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
builder.thread_pool = self.thread_pool;
|
||||
|
||||
builder
|
||||
@ -108,10 +101,9 @@ impl<'a> UpdateBuilder<'a> {
|
||||
builder.log_every_n = self.log_every_n;
|
||||
builder.max_nb_chunks = self.max_nb_chunks;
|
||||
builder.max_memory = self.max_memory;
|
||||
builder.linked_hash_map_size = self.linked_hash_map_size;
|
||||
builder.documents_chunk_size = self.documents_chunk_size;
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
builder.thread_pool = self.thread_pool;
|
||||
|
||||
builder
|
||||
@ -126,7 +118,6 @@ impl<'a> UpdateBuilder<'a> {
|
||||
|
||||
builder.chunk_compression_type = self.chunk_compression_type;
|
||||
builder.chunk_compression_level = self.chunk_compression_level;
|
||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||
|
||||
builder
|
||||
}
|
||||
|
@ -5,7 +5,7 @@ use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
|
||||
use crate::update::index_documents::{
|
||||
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod,
|
||||
create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
|
||||
@ -14,7 +14,6 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
}
|
||||
@ -29,12 +28,12 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordPrefixDocids::{}")]
|
||||
pub fn execute(self) -> Result<()> {
|
||||
// Clear the word prefix docids database.
|
||||
self.index.word_prefix_docids.clear(self.wtxn)?;
|
||||
@ -44,10 +43,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
// It is forbidden to keep a mutable reference into the database
|
||||
// and write into it at the same time, therefore we write into another file.
|
||||
let mut prefix_docids_sorter = create_sorter(
|
||||
roaring_bitmap_merge,
|
||||
merge_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
@ -70,7 +68,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_docids.as_polymorph(),
|
||||
prefix_docids_sorter,
|
||||
roaring_bitmap_merge,
|
||||
merge_roaring_bitmaps,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
|
@ -1,15 +1,13 @@
|
||||
use std::str;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fst::automaton::{Automaton, Str};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use fst::IntoStreamer;
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::BytesEncode;
|
||||
use log::debug;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::heed_codec::StrStrU8Codec;
|
||||
use crate::update::index_documents::{
|
||||
cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,
|
||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, MergeFn, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result};
|
||||
|
||||
@ -18,9 +16,9 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
threshold: u32,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
@ -33,55 +31,123 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
threshold: 100,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the number of words required to make a prefix be part of the words prefixes
|
||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
||||
///
|
||||
/// Default value is 100. This value must be higher than 50 and will be clamped
|
||||
/// to these bound otherwise.
|
||||
pub fn threshold(&mut self, value: u32) -> &mut Self {
|
||||
self.threshold = value.max(50);
|
||||
self
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
||||
pub fn execute(self) -> Result<()> {
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||
|
||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
|
||||
// Here we create a sorter akin to the previous one.
|
||||
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
// We insert all the word pairs corresponding to the word-prefix pairs
|
||||
// where the prefixes appears in the prefix FST previously constructed.
|
||||
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
|
||||
for result in db.iter(self.wtxn)? {
|
||||
let ((word1, word2, prox), data) = result?;
|
||||
let automaton = Str::new(word2).starts_with();
|
||||
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
||||
while let Some(prefix) = matching_prefixes.next() {
|
||||
let prefix = str::from_utf8(prefix)?;
|
||||
let pair = (word1, prefix, prox);
|
||||
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
|
||||
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
|
||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
let prefix_fst_keys = prefix_fst.into_stream().into_bytes();
|
||||
let prefix_fst_keys: Vec<_> = prefix_fst_keys
|
||||
.as_slice()
|
||||
.linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap())
|
||||
.collect();
|
||||
|
||||
let mut db =
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?;
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut current_prefixes: Option<&&[Vec<u8>]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some(((w1, w2, prox), data)) = db.next().transpose()? {
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes),
|
||||
_otherwise => {
|
||||
write_prefixes_in_sorter(
|
||||
&mut prefixes_cache,
|
||||
&mut word_prefix_pair_proximity_docids_sorter,
|
||||
self.threshold,
|
||||
)?;
|
||||
prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0]))
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(prefixes) = current_prefixes {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(w1.as_bytes());
|
||||
buffer.push(0);
|
||||
for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) {
|
||||
buffer.truncate(w1.len() + 1);
|
||||
buffer.extend_from_slice(prefix);
|
||||
buffer.push(prox);
|
||||
|
||||
match prefixes_cache.get_mut(&buffer) {
|
||||
Some(value) => value.push(data),
|
||||
None => {
|
||||
prefixes_cache.insert(buffer.clone(), vec![data]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
write_prefixes_in_sorter(
|
||||
&mut prefixes_cache,
|
||||
&mut word_prefix_pair_proximity_docids_sorter,
|
||||
self.threshold,
|
||||
)?;
|
||||
|
||||
drop(prefix_fst);
|
||||
drop(db);
|
||||
|
||||
// We finally write the word prefix pair proximity docids into the LMDB database.
|
||||
sorter_into_lmdb_database(
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
word_prefix_pair_proximity_docids_sorter,
|
||||
cbo_roaring_bitmap_merge,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_prefixes_in_sorter(
|
||||
prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
|
||||
sorter: &mut grenad::Sorter<MergeFn>,
|
||||
min_word_per_prefix: u32,
|
||||
) -> Result<()> {
|
||||
for (i, (key, data_slices)) in prefixes.drain().enumerate() {
|
||||
// if the number of words prefixed by the prefix is higher than the threshold,
|
||||
// we insert it in the sorter.
|
||||
if data_slices.len() > min_word_per_prefix as usize {
|
||||
for data in data_slices {
|
||||
sorter.insert(&key, data)?;
|
||||
}
|
||||
// if the first prefix isn't elligible for insertion,
|
||||
// then the other prefixes can't be elligible.
|
||||
} else if i == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -4,7 +4,7 @@ use std::num::NonZeroU32;
|
||||
use std::{cmp, str};
|
||||
|
||||
use fst::Streamer;
|
||||
use grenad::{CompressionType, FileFuse, Reader, Writer};
|
||||
use grenad::{CompressionType, Reader, Writer};
|
||||
use heed::types::{ByteSlice, DecodeIgnore, Str};
|
||||
use heed::{BytesEncode, Error};
|
||||
use log::debug;
|
||||
@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError};
|
||||
use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec};
|
||||
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
|
||||
use crate::update::index_documents::{
|
||||
cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database,
|
||||
create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database,
|
||||
write_into_lmdb_database, writer_into_reader, WriteMethod,
|
||||
};
|
||||
use crate::{Index, Result, TreeLevel};
|
||||
@ -24,7 +24,6 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
|
||||
index: &'i Index,
|
||||
pub(crate) chunk_compression_type: CompressionType,
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
level_group_size: NonZeroU32,
|
||||
@ -41,7 +40,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
index,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
chunk_fusing_shrink_size: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
level_group_size: NonZeroU32::new(4).unwrap(),
|
||||
@ -59,6 +57,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordsLevelPositions::{}")]
|
||||
pub fn execute(self) -> Result<()> {
|
||||
debug!("Computing and writing the word levels positions docids into LMDB on disk...");
|
||||
|
||||
@ -68,7 +67,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self.index.word_level_position_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
)?;
|
||||
@ -81,7 +79,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.index.word_level_position_docids.as_polymorph(),
|
||||
entries,
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" }),
|
||||
|_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" })?,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
@ -89,10 +87,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||
|
||||
let mut word_prefix_level_positions_docids_sorter = create_sorter(
|
||||
cbo_roaring_bitmap_merge,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.max_nb_chunks,
|
||||
self.max_memory,
|
||||
);
|
||||
@ -131,7 +128,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self.wtxn,
|
||||
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||
word_prefix_level_positions_docids_sorter,
|
||||
cbo_roaring_bitmap_merge,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
|
||||
@ -141,7 +138,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
self.index.word_prefix_level_position_docids,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
self.chunk_fusing_shrink_size,
|
||||
self.level_group_size,
|
||||
self.min_level_size,
|
||||
)?;
|
||||
@ -155,7 +151,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||
entries,
|
||||
|_, _| {
|
||||
Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })
|
||||
Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })?
|
||||
},
|
||||
WriteMethod::Append,
|
||||
)?;
|
||||
@ -185,10 +181,9 @@ fn compute_positions_levels(
|
||||
words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||
compression_type: CompressionType,
|
||||
compression_level: Option<u32>,
|
||||
shrink_size: Option<u64>,
|
||||
level_group_size: NonZeroU32,
|
||||
min_level_size: NonZeroU32,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
) -> Result<Reader<File>> {
|
||||
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
|
||||
// therefore we write the facet levels entries into a grenad file before transfering them.
|
||||
let mut writer = tempfile::tempfile()
|
||||
@ -254,7 +249,7 @@ fn compute_positions_levels(
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer, shrink_size)
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
fn write_level_entry(
|
||||
|
@ -8,7 +8,7 @@ use crate::{Index, Result, SmallString32};
|
||||
pub struct WordsPrefixesFst<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
index: &'i Index,
|
||||
threshold: f64,
|
||||
threshold: u32,
|
||||
max_prefix_length: usize,
|
||||
_update_id: u64,
|
||||
}
|
||||
@ -22,20 +22,20 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
WordsPrefixesFst {
|
||||
wtxn,
|
||||
index,
|
||||
threshold: 0.1 / 100.0, // .01%
|
||||
threshold: 100,
|
||||
max_prefix_length: 4,
|
||||
_update_id: update_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
|
||||
/// Set the number of words required to make a prefix be part of the words prefixes
|
||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
||||
///
|
||||
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
|
||||
/// to these bounds otherwise.
|
||||
pub fn threshold(&mut self, value: f64) -> &mut Self {
|
||||
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
|
||||
/// Default value is 100. This value must be higher than 50 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn threshold(&mut self, value: u32) -> &mut Self {
|
||||
self.threshold = value.max(50);
|
||||
self
|
||||
}
|
||||
|
||||
@ -48,10 +48,9 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
self
|
||||
}
|
||||
|
||||
#[logging_timer::time("WordsPrefixesFst::{}")]
|
||||
pub fn execute(self) -> Result<()> {
|
||||
let words_fst = self.index.words_fst(&self.wtxn)?;
|
||||
let number_of_words = words_fst.len();
|
||||
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
|
||||
|
||||
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
|
||||
for n in 1..=self.max_prefix_length {
|
||||
@ -80,7 +79,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||
current_prefix_count += 1;
|
||||
|
||||
// There is enough words corresponding to this prefix to add it to the cache.
|
||||
if current_prefix_count == min_number_of_words {
|
||||
if current_prefix_count >= self.threshold {
|
||||
builder.insert(prefix)?;
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,7 @@ use big_s::S;
|
||||
use either::{Either, Left, Right};
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::{hashmap, hashset};
|
||||
use milli::update::{IndexDocuments, Settings, UpdateFormat};
|
||||
use milli::update::{Settings, UpdateBuilder, UpdateFormat};
|
||||
use milli::{AscDesc, Criterion, DocumentId, Index};
|
||||
use serde::Deserialize;
|
||||
use slice_group_by::GroupBy;
|
||||
@ -50,7 +50,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
|
||||
// index documents
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
let mut builder = UpdateBuilder::new(0);
|
||||
builder.max_memory(10 * 1024 * 1024); // 10MiB
|
||||
let mut builder = builder.index_documents(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::JsonStream);
|
||||
builder.enable_autogenerate_docids();
|
||||
builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap();
|
||||
|
Loading…
x
Reference in New Issue
Block a user