mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Normalize for the search the facets values
This commit is contained in:
parent
3070a20580
commit
df528b41d8
27
milli/src/heed_codec/beu16_str_codec.rs
Normal file
27
milli/src/heed_codec/beu16_str_codec.rs
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
/// A heed codec for `(u16, &str)` pairs.
///
/// The encoded form is the `u16` written as 2 big-endian bytes,
/// immediately followed by the UTF-8 bytes of the string.
pub struct BEU16StrCodec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
|
||||||
|
type DItem = (u16, &'a str);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let (n_bytes, str_bytes) = bytes.split_at(2);
|
||||||
|
let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
|
||||||
|
let s = str::from_utf8(str_bytes).ok()?;
|
||||||
|
Some((n, s))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
|
||||||
|
type EItem = (u16, &'a str);
|
||||||
|
|
||||||
|
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let mut bytes = Vec::with_capacity(s.len() + 2);
|
||||||
|
bytes.extend_from_slice(&n.to_be_bytes());
|
||||||
|
bytes.extend_from_slice(s.as_bytes());
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
@ -1,3 +1,4 @@
|
|||||||
|
mod beu16_str_codec;
|
||||||
mod beu32_str_codec;
|
mod beu32_str_codec;
|
||||||
mod byte_slice_ref;
|
mod byte_slice_ref;
|
||||||
pub mod facet;
|
pub mod facet;
|
||||||
@ -14,6 +15,7 @@ mod str_str_u8_codec;
|
|||||||
pub use byte_slice_ref::ByteSliceRefCodec;
|
pub use byte_slice_ref::ByteSliceRefCodec;
|
||||||
pub use str_ref::StrRefCodec;
|
pub use str_ref::StrRefCodec;
|
||||||
|
|
||||||
|
pub use self::beu16_str_codec::BEU16StrCodec;
|
||||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||||
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
||||||
pub use self::fst_set_codec::FstSetCodec;
|
pub use self::fst_set_codec::FstSetCodec;
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@ -21,7 +21,9 @@ use crate::heed_codec::facet::{
|
|||||||
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||||
FieldIdCodec, OrderedF64Codec,
|
FieldIdCodec, OrderedF64Codec,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
use crate::heed_codec::{
|
||||||
|
BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
|
||||||
|
};
|
||||||
use crate::readable_slices::ReadableSlices;
|
use crate::readable_slices::ReadableSlices;
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
@ -96,6 +98,7 @@ pub mod db_name {
|
|||||||
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
|
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
|
||||||
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
|
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
|
||||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||||
|
pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = "facet-id-normalized-string-strings";
|
||||||
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
|
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
|
||||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||||
@ -157,6 +160,8 @@ pub struct Index {
|
|||||||
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
||||||
/// Maps the facet field id and ranges of strings with the docids that corresponds to them.
|
/// Maps the facet field id and ranges of strings with the docids that corresponds to them.
|
||||||
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
|
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
|
||||||
|
/// Maps the facet field id of the normalized-for-search string facets with their original versions.
|
||||||
|
pub facet_id_normalized_string_strings: Database<BEU16StrCodec, SerdeJson<BTreeSet<String>>>,
|
||||||
/// Maps the facet field id of the string facets with an FST containing all the facets values.
|
/// Maps the facet field id of the string facets with an FST containing all the facets values.
|
||||||
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
|
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
|
||||||
|
|
||||||
@ -181,7 +186,7 @@ impl Index {
|
|||||||
) -> Result<Index> {
|
) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(24);
|
options.max_dbs(25);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
@ -211,6 +216,8 @@ impl Index {
|
|||||||
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
|
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
|
||||||
let facet_id_string_docids =
|
let facet_id_string_docids =
|
||||||
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
|
||||||
|
let facet_id_normalized_string_strings =
|
||||||
|
env.create_database(&mut wtxn, Some(FACET_ID_NORMALIZED_STRING_STRINGS))?;
|
||||||
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
|
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
|
||||||
let facet_id_exists_docids =
|
let facet_id_exists_docids =
|
||||||
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||||
@ -246,6 +253,7 @@ impl Index {
|
|||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_normalized_string_strings,
|
||||||
facet_id_string_fst,
|
facet_id_string_fst,
|
||||||
facet_id_exists_docids,
|
facet_id_exists_docids,
|
||||||
facet_id_is_null_docids,
|
facet_id_is_null_docids,
|
||||||
|
@ -51,9 +51,10 @@ pub use self::error::{
|
|||||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||||
pub use self::fields_ids_map::FieldsIdsMap;
|
pub use self::fields_ids_map::FieldsIdsMap;
|
||||||
pub use self::heed_codec::{
|
pub use self::heed_codec::{
|
||||||
BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
|
BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
|
||||||
CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
|
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
|
||||||
RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec,
|
RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
|
||||||
|
UncheckedU8StrStrCodec,
|
||||||
};
|
};
|
||||||
pub use self::index::Index;
|
pub use self::index::Index;
|
||||||
pub use self::search::{
|
pub use self::search::{
|
||||||
|
@ -339,11 +339,12 @@ impl<'a> SearchForFacetValues<'a> {
|
|||||||
|
|
||||||
let mut stream = fst.search(automaton).into_stream();
|
let mut stream = fst.search(automaton).into_stream();
|
||||||
let mut length = 0;
|
let mut length = 0;
|
||||||
while let Some(facet_value) = stream.next() {
|
'outer: while let Some(facet_value) = stream.next() {
|
||||||
let value = std::str::from_utf8(facet_value)?;
|
let value = std::str::from_utf8(facet_value)?;
|
||||||
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
|
let database = index.facet_id_normalized_string_strings;
|
||||||
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
|
let key = (fid, value);
|
||||||
Some(FacetGroupValue { bitmap, .. }) => bitmap,
|
let original_strings = match database.get(rtxn, &key)? {
|
||||||
|
Some(original_strings) => original_strings,
|
||||||
None => {
|
None => {
|
||||||
error!(
|
error!(
|
||||||
"the facet value is missing from the facet database: {key:?}"
|
"the facet value is missing from the facet database: {key:?}"
|
||||||
@ -351,16 +352,36 @@ impl<'a> SearchForFacetValues<'a> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let count = search_candidates.intersection_len(&docids);
|
for original_string in original_strings {
|
||||||
if count != 0 {
|
let key = FacetGroupKey {
|
||||||
let value = self
|
field_id: fid,
|
||||||
.one_original_value_of(fid, value, docids.min().unwrap())?
|
level: 0,
|
||||||
.unwrap_or_else(|| query.to_string());
|
left_bound: original_string.as_str(),
|
||||||
results.push(FacetValueHit { value, count });
|
};
|
||||||
length += 1;
|
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
|
||||||
}
|
Some(FacetGroupValue { bitmap, .. }) => bitmap,
|
||||||
if length >= MAX_NUMBER_OF_FACETS {
|
None => {
|
||||||
break;
|
error!(
|
||||||
|
"the facet value is missing from the facet database: {key:?}"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let count = search_candidates.intersection_len(&docids);
|
||||||
|
if count != 0 {
|
||||||
|
let value = self
|
||||||
|
.one_original_value_of(
|
||||||
|
fid,
|
||||||
|
&original_string,
|
||||||
|
docids.min().unwrap(),
|
||||||
|
)?
|
||||||
|
.unwrap_or_else(|| query.to_string());
|
||||||
|
results.push(FacetValueHit { value, count });
|
||||||
|
length += 1;
|
||||||
|
}
|
||||||
|
if length >= MAX_NUMBER_OF_FACETS {
|
||||||
|
break 'outer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
script_language_docids,
|
script_language_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_normalized_string_strings,
|
||||||
facet_id_string_fst,
|
facet_id_string_fst,
|
||||||
facet_id_exists_docids,
|
facet_id_exists_docids,
|
||||||
facet_id_is_null_docids,
|
facet_id_is_null_docids,
|
||||||
@ -92,6 +93,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_fid_docids.clear(self.wtxn)?;
|
word_prefix_fid_docids.clear(self.wtxn)?;
|
||||||
script_language_docids.clear(self.wtxn)?;
|
script_language_docids.clear(self.wtxn)?;
|
||||||
facet_id_f64_docids.clear(self.wtxn)?;
|
facet_id_f64_docids.clear(self.wtxn)?;
|
||||||
|
facet_id_normalized_string_strings.clear(self.wtxn)?;
|
||||||
facet_id_string_fst.clear(self.wtxn)?;
|
facet_id_string_fst.clear(self.wtxn)?;
|
||||||
facet_id_exists_docids.clear(self.wtxn)?;
|
facet_id_exists_docids.clear(self.wtxn)?;
|
||||||
facet_id_is_null_docids.clear(self.wtxn)?;
|
facet_id_is_null_docids.clear(self.wtxn)?;
|
||||||
|
@ -237,6 +237,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_fid_docids,
|
word_prefix_fid_docids,
|
||||||
facet_id_f64_docids: _,
|
facet_id_f64_docids: _,
|
||||||
facet_id_string_docids: _,
|
facet_id_string_docids: _,
|
||||||
|
facet_id_normalized_string_strings: _,
|
||||||
facet_id_string_fst: _,
|
facet_id_string_fst: _,
|
||||||
field_id_docid_facet_f64s: _,
|
field_id_docid_facet_f64s: _,
|
||||||
field_id_docid_facet_strings: _,
|
field_id_docid_facet_strings: _,
|
||||||
|
@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
|
|||||||
pub const FACET_GROUP_SIZE: u8 = 4;
|
pub const FACET_GROUP_SIZE: u8 = 4;
|
||||||
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||||
|
|
||||||
|
use std::collections::BTreeSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::iter::FromIterator;
|
||||||
|
|
||||||
use heed::types::DecodeIgnore;
|
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||||
|
use grenad::{CompressionType, SortAlgorithm};
|
||||||
|
use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
|
||||||
|
use heed::BytesEncode;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
|
|||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||||
use crate::heed_codec::ByteSliceRefCodec;
|
use crate::heed_codec::ByteSliceRefCodec;
|
||||||
use crate::{Index, Result, BEU16};
|
use crate::update::index_documents::create_sorter;
|
||||||
|
use crate::update::merge_btreeset_string;
|
||||||
|
use crate::{BEU16StrCodec, Index, Result, BEU16};
|
||||||
|
|
||||||
pub mod bulk;
|
pub mod bulk;
|
||||||
pub mod delete;
|
pub mod delete;
|
||||||
@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
incremental_update.execute(wtxn)?;
|
incremental_update.execute(wtxn)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We clear the list of normalized-for-search facets
|
||||||
|
// and the previous FSTs to compute everything from scratch
|
||||||
|
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
|
||||||
|
self.index.facet_id_string_fst.clear(wtxn)?;
|
||||||
|
|
||||||
|
// As we can't use the same write transaction to read and write in two different databases
|
||||||
|
// we must create a temporary sorter that we will write into LMDB afterward.
|
||||||
|
// As multiple unnormalized facet values can become the same normalized facet value
|
||||||
|
// we must merge them together.
|
||||||
|
let mut sorter = create_sorter(
|
||||||
|
SortAlgorithm::Unstable,
|
||||||
|
merge_btreeset_string,
|
||||||
|
CompressionType::None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
);
|
||||||
|
|
||||||
|
// We iterate on the list of original, semi-normalized, facet values
|
||||||
|
// and normalize them for search, inserting them in the sorter in any given order.
|
||||||
|
let options = NormalizerOption { lossy: true, ..Default::default() };
|
||||||
|
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
|
||||||
|
for result in database.iter(wtxn)? {
|
||||||
|
let (facet_group_key, ()) = result?;
|
||||||
|
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
|
||||||
|
let normalized_facet = left_bound.normalize(&options);
|
||||||
|
let set = BTreeSet::from_iter(std::iter::once(left_bound));
|
||||||
|
let key = (field_id, normalized_facet.as_ref());
|
||||||
|
let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
|
||||||
|
let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
|
||||||
|
sorter.insert(key, val)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// In this loop we don't need to take care of merging the sets of strings
|
||||||
|
// as the grenad sorter already merged them for us.
|
||||||
|
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||||
|
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
|
||||||
|
self.index
|
||||||
|
.facet_id_normalized_string_strings
|
||||||
|
.remap_types::<ByteSlice, ByteSlice>()
|
||||||
|
.put(wtxn, key_bytes, btreeset_bytes)?;
|
||||||
|
}
|
||||||
|
|
||||||
// We compute one FST by string facet
|
// We compute one FST by string facet
|
||||||
let mut text_fsts = vec![];
|
let mut text_fsts = vec![];
|
||||||
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||||
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
|
let database =
|
||||||
|
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
|
||||||
for result in database.iter(wtxn)? {
|
for result in database.iter(wtxn)? {
|
||||||
let (facet_group_key, _) = result?;
|
let ((field_id, normalized_facet), _) = result?;
|
||||||
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
|
current_fst = match current_fst.take() {
|
||||||
current_fst = match current_fst.take() {
|
Some((fid, fst_builder)) if fid != field_id => {
|
||||||
Some((fid, fst_builder)) if fid != field_id => {
|
let fst = fst_builder.into_set();
|
||||||
let fst = fst_builder.into_set();
|
text_fsts.push((fid, fst));
|
||||||
text_fsts.push((fid, fst));
|
Some((field_id, fst::SetBuilder::memory()))
|
||||||
Some((field_id, fst::SetBuilder::memory()))
|
|
||||||
}
|
|
||||||
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
|
||||||
None => Some((field_id, fst::SetBuilder::memory())),
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
|
||||||
fst_builder.insert(left_bound)?;
|
|
||||||
}
|
}
|
||||||
|
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||||
|
None => Some((field_id, fst::SetBuilder::memory())),
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||||
|
fst_builder.insert(normalized_facet)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
text_fsts.push((field_id, fst));
|
text_fsts.push((field_id, fst));
|
||||||
}
|
}
|
||||||
|
|
||||||
// We remove all of the previous FSTs that were in this database
|
|
||||||
self.index.facet_id_string_fst.clear(wtxn)?;
|
|
||||||
|
|
||||||
// We write those FSTs in LMDB now
|
// We write those FSTs in LMDB now
|
||||||
for (field_id, fst) in text_fsts {
|
for (field_id, fst) in text_fsts {
|
||||||
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
|
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||||
|
if values.len() == 1 {
|
||||||
|
Ok(values[0].clone())
|
||||||
|
} else {
|
||||||
|
// TODO improve the perf by using a `#[borrow] Cow<str>`.
|
||||||
|
let strings: BTreeSet<String> = values
|
||||||
|
.iter()
|
||||||
|
.map(AsRef::as_ref)
|
||||||
|
.map(serde_json::from_slice::<BTreeSet<String>>)
|
||||||
|
.map(StdResult::unwrap)
|
||||||
|
.reduce(|mut current, new| {
|
||||||
|
for x in new {
|
||||||
|
current.insert(x);
|
||||||
|
}
|
||||||
|
current
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||||
Ok(values[0].clone())
|
Ok(values[0].clone())
|
||||||
}
|
}
|
||||||
|
@ -13,9 +13,9 @@ pub use grenad_helpers::{
|
|||||||
GrenadParameters, MergeableReader,
|
GrenadParameters, MergeableReader,
|
||||||
};
|
};
|
||||||
pub use merge_functions::{
|
pub use merge_functions::{
|
||||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
|
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
|
||||||
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
|
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
|
||||||
MergeFn,
|
serialize_roaring_bitmap, MergeFn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::MAX_WORD_LENGTH;
|
use crate::MAX_WORD_LENGTH;
|
||||||
|
@ -26,7 +26,7 @@ pub use self::enrich::{
|
|||||||
};
|
};
|
||||||
pub use self::helpers::{
|
pub use self::helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||||
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||||
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
|
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
|
||||||
};
|
};
|
||||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||||
|
@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
|
|||||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||||
pub use self::index_documents::{
|
pub use self::index_documents::{
|
||||||
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
|
merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
|
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||||
|
MergeFn,
|
||||||
};
|
};
|
||||||
pub use self::indexer_config::IndexerConfig;
|
pub use self::indexer_config::IndexerConfig;
|
||||||
pub use self::prefix_word_pairs::{
|
pub use self::prefix_word_pairs::{
|
||||||
|
Loading…
Reference in New Issue
Block a user