4888: bring back v1.10.0 into main r=Kerollmops a=ManyTheFish



Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
meili-bors[bot] 2024-08-27 14:02:08 +00:00 committed by GitHub
commit 9a756cf2c5
27 changed files with 2618 additions and 99 deletions

View file

@@ -9,7 +9,6 @@ use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
use roaring::RoaringBitmap;
use rstar::RTree;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
@@ -173,8 +172,8 @@ impl Index {
pub fn new_with_creation_dates<P: AsRef<Path>>(
mut options: heed::EnvOpenOptions,
path: P,
created_at: OffsetDateTime,
updated_at: OffsetDateTime,
created_at: time::OffsetDateTime,
updated_at: time::OffsetDateTime,
) -> Result<Index> {
use db_name::*;
@@ -256,22 +255,22 @@ impl Index {
}
pub fn new<P: AsRef<Path>>(options: heed::EnvOpenOptions, path: P) -> Result<Index> {
let now = OffsetDateTime::now_utc();
let now = time::OffsetDateTime::now_utc();
Self::new_with_creation_dates(options, path, now, now)
}
fn set_creation_dates(
env: &heed::Env,
main: Database<Unspecified, Unspecified>,
created_at: OffsetDateTime,
updated_at: OffsetDateTime,
created_at: time::OffsetDateTime,
updated_at: time::OffsetDateTime,
) -> heed::Result<()> {
let mut txn = env.write_txn()?;
// The db was just created, so we update its metadata with the relevant information.
let main = main.remap_types::<Str, SerdeJson<OffsetDateTime>>();
if main.get(&txn, main_key::CREATED_AT_KEY)?.is_none() {
main.put(&mut txn, main_key::UPDATED_AT_KEY, &updated_at)?;
main.put(&mut txn, main_key::CREATED_AT_KEY, &created_at)?;
main.put(&mut txn, main_key::UPDATED_AT_KEY, &OffsetDateTime(updated_at))?;
main.put(&mut txn, main_key::CREATED_AT_KEY, &OffsetDateTime(created_at))?;
txn.commit()?;
}
Ok(())
@@ -371,7 +370,7 @@ impl Index {
wtxn: &mut RwTxn<'_>,
primary_key: &str,
) -> heed::Result<()> {
self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
self.set_updated_at(wtxn, &time::OffsetDateTime::now_utc())?;
self.main.remap_types::<Str, Str>().put(wtxn, main_key::PRIMARY_KEY_KEY, primary_key)
}
@@ -1323,7 +1322,7 @@ impl Index {
}
/// Returns the index creation time.
pub fn created_at(&self, rtxn: &RoTxn<'_>) -> Result<OffsetDateTime> {
pub fn created_at(&self, rtxn: &RoTxn<'_>) -> Result<time::OffsetDateTime> {
Ok(self
.main
.remap_types::<Str, SerdeJson<OffsetDateTime>>()
@@ -1331,11 +1330,12 @@ impl Index {
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
key: Some(main_key::CREATED_AT_KEY),
})?)
})?
.0)
}
/// Returns the index last updated time.
pub fn updated_at(&self, rtxn: &RoTxn<'_>) -> Result<OffsetDateTime> {
pub fn updated_at(&self, rtxn: &RoTxn<'_>) -> Result<time::OffsetDateTime> {
Ok(self
.main
.remap_types::<Str, SerdeJson<OffsetDateTime>>()
@@ -1343,18 +1343,19 @@ impl Index {
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
key: Some(main_key::UPDATED_AT_KEY),
})?)
})?
.0)
}
pub(crate) fn set_updated_at(
&self,
wtxn: &mut RwTxn<'_>,
time: &OffsetDateTime,
time: &time::OffsetDateTime,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<OffsetDateTime>>().put(
wtxn,
main_key::UPDATED_AT_KEY,
time,
&OffsetDateTime(*time),
)
}
@@ -1681,6 +1682,10 @@ pub struct IndexEmbeddingConfig {
pub user_provided: RoaringBitmap,
}
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime);
#[cfg(test)]
pub(crate) mod tests {
use std::collections::HashSet;
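A minimal standalone sketch of what the new transparent wrapper changes (assuming the `time` crate with its `serde-well-known` feature, plus `serde` and `serde_json`): dates written through it are stored as RFC 3339 strings instead of `time`'s default serde representation, and they round-trip back through the same wrapper.

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
#[serde(transparent)]
struct Rfc3339DateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime);

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let now = time::OffsetDateTime::now_utc();

    // Serializes as a single RFC 3339 string, e.g. "2024-08-27T14:02:08Z".
    let json = serde_json::to_string(&Rfc3339DateTime(now))?;
    println!("{json}");

    // Deserializing goes through the same wrapper, so the instant round-trips.
    let back: Rfc3339DateTime = serde_json::from_str(&json)?;
    assert_eq!(back.0.unix_timestamp(), now.unix_timestamp());
    Ok(())
}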

View file

@@ -90,6 +90,21 @@ impl LocalizedFieldIds {
pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> {
self.field_id_to_locales.get(&fields_id).map(Vec::as_slice)
}
pub fn all_locales(&self) -> Vec<Language> {
let mut locales = Vec::new();
for field_locales in self.field_id_to_locales.values() {
if !field_locales.is_empty() {
locales.extend(field_locales);
} else {
// If a field has no locales, we consider it not localized and return no locales at all
return Vec::new();
}
}
locales.sort();
locales.dedup();
locales
}
}
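A hypothetical standalone sketch of the `all_locales` semantics added above, with plain strings standing in for charabia's `Language`: the result is the deduplicated union of every field's locales, except that a single field without explicit locales empties the whole list, since detection must then stay unrestricted.

use std::collections::BTreeMap;

fn all_locales(field_id_to_locales: &BTreeMap<u16, Vec<&'static str>>) -> Vec<&'static str> {
    let mut locales = Vec::new();
    for field_locales in field_id_to_locales.values() {
        if field_locales.is_empty() {
            // one unlocalized field means we cannot restrict detection at all
            return Vec::new();
        }
        locales.extend(field_locales.iter().copied());
    }
    locales.sort();
    locales.dedup();
    locales
}

fn main() {
    let mut fields = BTreeMap::new();
    fields.insert(0, vec!["eng", "fra"]);
    fields.insert(1, vec!["fra"]);
    assert_eq!(all_locales(&fields), vec!["eng", "fra"]);

    fields.insert(2, Vec::new()); // a field without explicit locales
    assert!(all_locales(&fields).is_empty());
}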
#[cfg(test)]

View file

@@ -339,10 +339,18 @@ impl ValuesCollection {
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
let options = NormalizerOption { lossy: true, ..Default::default() };
let mut detection = StrDetection::new(facet_string, locales);
// Detect the language of the facet string only if several locales are explicitly provided.
let language = match locales {
Some(&[language]) => Some(language),
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
_ => None,
};
let token = Token {
lemma: std::borrow::Cow::Borrowed(facet_string),
script: detection.script(),
language: detection.language(),
language,
..Default::default()
};
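A hedged sketch of the locale-selection rule introduced here, with a toy `Language` enum and a placeholder `detect` function standing in for charabia's `StrDetection`: one explicit locale is trusted as-is, several explicit locales trigger detection restricted to that list, and anything else leaves the token language unset.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Language { Eng, Fra }

// placeholder for the real detector, which would inspect the string
fn detect(_facet_string: &str, _allowed: &[Language]) -> Option<Language> {
    Some(Language::Eng)
}

fn pick_language(facet_string: &str, locales: Option<&[Language]>) -> Option<Language> {
    match locales {
        // a single explicit locale: no detection needed
        Some(&[language]) => Some(language),
        // several explicit locales: detect, restricted to that list
        Some(multiple) if multiple.len() > 1 => detect(facet_string, multiple),
        // no locale information: leave the language unset
        _ => None,
    }
}

fn main() {
    let only_french: &[Language] = &[Language::Fra];
    let several: &[Language] = &[Language::Eng, Language::Fra];
    assert_eq!(pick_language("fromage", Some(only_french)), Some(Language::Fra));
    assert_eq!(pick_language("cheese", Some(several)), Some(Language::Eng));
    assert_eq!(pick_language("queso", None), None);
}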

View file

@@ -360,6 +360,7 @@ mod test {
use super::*;
#[cfg(feature = "japanese")]
#[cfg(not(feature = "chinese-pinyin"))]
#[test]
fn test_kanji_language_detection() {
use crate::index::tests::TempIndex;

View file

@@ -110,18 +110,18 @@ impl<'ctx> DatabaseCache<'ctx> {
.map_err(Into::into)
}
fn get_value_from_keys<'v, K1, KC, DC>(
fn get_value_from_keys<'v, K1, KC>(
txn: &'ctx RoTxn<'_>,
cache_key: K1,
db_keys: &'v [KC::EItem],
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
universe: Option<&RoaringBitmap>,
merger: MergeFn,
) -> Result<Option<DC::DItem>>
) -> Result<Option<RoaringBitmap>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
DC: BytesDecodeOwned,
KC::EItem: Sized,
{
if let Entry::Vacant(entry) = cache.entry(cache_key) {
@@ -146,16 +146,22 @@ impl<'ctx> DatabaseCache<'ctx> {
entry.insert(bitmap_ptr);
}
match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes)
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some)
.map_err(Into::into)
}
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
None => Ok(None),
}
}
}
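A standalone sketch (using the `roaring` crate directly) of what threading `universe` through the cache buys: the cached posting list comes back already intersected with the candidate documents. The naive version below deserializes and then intersects; the diff above uses `CboRoaringBitmapCodec::intersection_with_serialized` to do the same directly from the stored bytes.

use roaring::RoaringBitmap;

fn decode_intersected(
    serialized: &[u8],
    universe: Option<&RoaringBitmap>,
) -> std::io::Result<RoaringBitmap> {
    // naive: fully deserialize, then intersect with the allowed documents
    let mut bitmap = RoaringBitmap::deserialize_from(serialized)?;
    if let Some(universe) = universe {
        bitmap &= universe;
    }
    Ok(bitmap)
}

fn main() -> std::io::Result<()> {
    let docids: RoaringBitmap = (0u32..100).collect();
    let mut bytes = Vec::new();
    docids.serialize_into(&mut bytes)?;

    let universe: RoaringBitmap = (50u32..60).collect();
    assert_eq!(decode_intersected(&bytes, Some(&universe))?, universe);
    Ok(())
}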
@@ -207,12 +213,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
word,
&keys[..],
&mut self.db_cache.word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}
@@ -238,12 +245,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
word,
&keys[..],
&mut self.db_cache.exact_word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}
@@ -294,12 +302,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}
@@ -325,12 +334,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.exact_word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}

View file

@@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
@@ -671,9 +672,44 @@ pub fn execute_search(
tokbuilder.words_dict(dictionary);
}
if let Some(locales) = locales {
tokbuilder.allow_list(locales);
}
let db_locales;
match locales {
Some(locales) => {
if !locales.is_empty() {
tokbuilder.allow_list(locales);
}
}
None => {
// If no locales are specified in the query, we fall back to the locales defined in the localized attributes rules
let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
let localized_fields = match &ctx.restricted_fids {
// if AttributeToSearchOn is set, use the restricted list of ids
Some(restricted_fids) => {
let iter = restricted_fids
.exact
.iter()
.chain(restricted_fids.tolerant.iter())
.map(|(fid, _)| *fid);
LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
}
// Otherwise use the full list of ids coming from the index searchable fields
None => LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
searchable_fields.into_iter(),
),
};
db_locales = localized_fields.all_locales();
if !db_locales.is_empty() {
tokbuilder.allow_list(&db_locales);
}
}
};
let tokenizer = tokbuilder.build();
drop(entered);
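A hedged sketch of the fallback added here, with plain strings in place of charabia's `Language`: explicit query locales win; otherwise the union of locales declared on the (possibly restricted) searchable fields is used; and an empty list leaves language detection unrestricted.

fn effective_allow_list(
    query_locales: Option<&[String]>,
    db_locales: Vec<String>,
) -> Option<Vec<String>> {
    match query_locales {
        // explicit, non-empty locales from the query win
        Some(locales) if !locales.is_empty() => Some(locales.to_vec()),
        // explicitly empty: leave detection unrestricted
        Some(_) => None,
        // nothing in the query: fall back to the locales derived from the rules
        None if !db_locales.is_empty() => Some(db_locales),
        None => None,
    }
}

fn main() {
    let from_rules = vec!["jpn".to_string()];
    assert_eq!(effective_allow_list(None, from_rules.clone()), Some(from_rules));

    let empty: &[String] = &[];
    assert_eq!(effective_allow_list(Some(empty), vec!["jpn".to_string()]), None);
}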

View file

@@ -6,6 +6,7 @@ pub mod exactness;
pub mod geo_sort;
pub mod integration;
#[cfg(feature = "all-tokenizations")]
#[cfg(not(feature = "chinese-pinyin"))]
pub mod language;
pub mod ngram_split_words;
pub mod proximity;

View file

@@ -12,6 +12,7 @@ use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::{
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
@@ -28,6 +29,116 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
if settings_diff.settings_update_only() {
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
} else {
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
extract_facet_string_docids_document_update(
docid_fid_facet_string,
indexer,
localized_field_ids,
)
}
}
/// Extracts the facet string and the document ids where this facet string appears.
///
/// Returns a grenad reader with the list of extracted facet strings and
/// document ids from the given chunk of docid facet string positions.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
localized_field_ids: &LocalizedFieldIds,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut normalized_facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_deladd_btreeset_string,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
&& deladd_reader.get(DelAdd::Addition).is_some();
if is_same_value {
continue;
}
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let (document_id_bytes, normalized_value_bytes) =
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
let normalized_value = str::from_utf8(normalized_value_bytes)?;
// Facet search normalization
{
let locales = localized_field_ids.locales(field_id);
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
// as the facet string is the same, we can put the deletion and addition in the same obkv.
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in deladd_reader.iter() {
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
obkv.insert(deladd_key, val)?;
}
obkv.finish()?;
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in deladd_reader.iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
}
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
}
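A standalone sketch of the key shape fed to the normalized-facet-string sorter, assuming the layout the `BEU16StrCodec` name suggests: a big-endian `u16` field id followed by the normalized string, so entries group by field first and then sort lexicographically by value.

fn beu16_str_key(field_id: u16, value: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(2 + value.len());
    // big-endian so that byte-wise ordering follows the numeric field id
    key.extend_from_slice(&field_id.to_be_bytes());
    key.extend_from_slice(value.as_bytes());
    key
}

fn main() {
    let a = beu16_str_key(1, "gruyere");
    let b = beu16_str_key(2, "brie");
    // the field id dominates the ordering, regardless of the string values
    assert!(a < b);
}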
/// Extracts the facet string and the document ids where this facet string appears.
///
/// Returns a grenad reader with the list of extracted facet strings and
/// document ids from the given chunk of docid facet string positions.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
@@ -60,6 +171,15 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
let are_same_locales = old_locales == new_locales;
if is_same_value && are_same_locales {
continue;
}
let (document_id_bytes, normalized_value_bytes) =
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
@@ -68,15 +188,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
// Facet search normalization
{
let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
let new_hyper_normalized_value = if are_same_locales {
&old_hyper_normalized_value
} else {
&normalize_facet_string(normalized_value, new_locales)
};
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
// if the facet string is the same, we can put the deletion and addition in the same obkv.
if old_hyper_normalized_value == new_hyper_normalized_value {
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
// nothing to do if we delete and re-add the value.
if is_same_value {
continue;
@@ -148,12 +270,21 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
/// Normalizes the facet string and truncates it to the max length.
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
let options = NormalizerOption { lossy: true, ..Default::default() };
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
let mut detection = StrDetection::new(facet_string, locales);
let script = detection.script();
// Detect the language of the facet string only if several locales are explicitly provided.
let language = match locales {
Some(&[language]) => Some(language),
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
_ => None,
};
let token = Token {
lemma: std::borrow::Cow::Borrowed(facet_string),
script: detection.script(),
language: detection.language(),
script,
language,
..Default::default()
};

View file

@@ -9,7 +9,7 @@ use std::result::Result as StdResult;
use bytemuck::bytes_of;
use grenad::Sorter;
use heed::BytesEncode;
use itertools::{merge_join_by, EitherOrBoth};
use itertools::{merge_join_by, EitherOrBoth, Itertools};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::{from_slice, Value};
@@ -317,11 +317,15 @@ fn deladd_obkv_cbo_roaring_bitmaps(
}
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_string(s: String) -> String {
s.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect()
fn truncate_str(s: &str) -> &str {
let index = s
.char_indices()
.map(|(idx, _)| idx)
.chain(std::iter::once(s.len()))
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
.last();
&s[..index.unwrap_or(0)]
}
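A quick standalone check of the boundary-safe truncation above, using a tiny limit in place of the real `MAX_FACET_VALUE_LENGTH`: the cut always lands on a `char` boundary, so multi-byte UTF-8 characters are never split (slicing mid-character would panic).

const MAX_LEN: usize = 4;

fn truncate_str(s: &str) -> &str {
    // keep the largest char boundary (or the string length) that fits in MAX_LEN
    let index = s
        .char_indices()
        .map(|(idx, _)| idx)
        .chain(std::iter::once(s.len()))
        .take_while(|idx| *idx <= MAX_LEN)
        .last();
    &s[..index.unwrap_or(0)]
}

fn main() {
    assert_eq!(truncate_str("abcdef"), "abcd");
    // "aééé": the char starting at byte 3 spans bytes 3..5, so a naive cut at byte 4
    // would split it; the function backs off to the previous boundary instead.
    assert_eq!(truncate_str("aééé"), "aé");
    assert_eq!(truncate_str("ab"), "ab");
}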
/// Computes the diff between both Del and Add numbers and
@@ -401,36 +405,102 @@ where
del_strings.dedup();
add_strings.dedup();
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
let merged_strings_iter = itertools::merge_join_by(
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|del, add| del.cmp(add),
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
);
// insert normalized and original facet string in sorter
for eob in merged_strings_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left((normalized, original)) => {
let truncated = truncate_string(normalized);
let (side, normalized, original) = match eob {
EitherOrBoth::Both((normalized, del), (_, add)) => {
let merged_strings_iter =
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
original_del.cmp(original_add)
});
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
// but we possibly have multiple original values that changed in the case where the field is an
// array of multiple values that normalize to the same value.
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
//
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
// value that is only in the old version, and adding the first value that is only in the new version
let mut obkv = KvWriterDelAdd::memory();
let mut del = None;
let mut add = None;
let mut both = None;
for eob in merged_strings_iter {
match eob {
EitherOrBoth::Both((_normalized, original), _) => {
both = match both {
Some(both) => Some(both),
None => Some(original),
}
}
EitherOrBoth::Left((_normalized, original)) => {
del = match del {
Some(del) => Some(del),
None => Some(original),
};
}
EitherOrBoth::Right((_normalized, original)) => {
add = match add {
Some(add) => Some(add),
None => Some(original),
}
}
}
}
if let Some(del) = del {
obkv.insert(DelAdd::Deletion, del)?;
}
if let Some(add) = add
// prefer the newly added, but if there is none, keep a value in the list of values
// since the normalized value appears both in old and new, we should never remove it.
.or(both)
{
obkv.insert(DelAdd::Addition, add)?;
}
let truncated = truncate_str(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
continue;
}
EitherOrBoth::Right((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
EitherOrBoth::Left((_normalized, mut original)) => {
// FIXME: we only consider the first value for the purpose of facet search
// another structure is needed, able to retain all originals associated with a normalized value.
let Some((normalized, original)) = original.next() else {
continue;
};
(DelAdd::Deletion, normalized, original)
}
EitherOrBoth::Right((_normalized, mut original)) => {
// FIXME: we only consider the first value for the purpose of facet search
// another structure is needed, able to retain all originals associated with a normalized value.
let Some((normalized, original)) = original.next() else {
continue;
};
(DelAdd::Addition, normalized, original)
}
};
let truncated = truncate_str(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
}
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(side, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
Ok(())

View file

@@ -290,7 +290,7 @@ where
match result? {
DocumentEdition::Deleted(docid) => {
documents_to_remove.push(docid);
documents_to_remove.insert(docid);
}
DocumentEdition::Edited(new_document) => {
documents_batch_builder.append_json_object(&new_document)?;
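A small sketch of what switching `documents_to_remove` from a `Vec` push to a bitmap insert gives (presumably the motivation here): `RoaringBitmap::insert` deduplicates docids and keeps them sorted, whatever order the edition results arrive in.

use roaring::RoaringBitmap;

fn main() {
    let mut documents_to_remove = RoaringBitmap::new();
    for docid in [3u32, 1, 3, 2, 1] {
        documents_to_remove.insert(docid);
    }
    // duplicates collapse and iteration is sorted
    assert_eq!(documents_to_remove.len(), 3);
    assert_eq!(documents_to_remove.iter().collect::<Vec<u32>>(), vec![1, 2, 3]);
}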

View file

@@ -62,8 +62,18 @@ pub enum EmbedErrorKind {
RestResponseDeserialization(std::io::Error),
#[error("expected a response containing {0} embeddings, got only {1}")]
RestResponseEmbeddingCount(usize, usize),
#[error("could not authenticate against embedding server{}", option_info(.0.as_deref(), "server replied with "))]
RestUnauthorized(Option<String>),
#[error("could not authenticate against {embedding} server{server_reply}{hint}", embedding=match *.1 {
ConfigurationSource::User => "embedding",
ConfigurationSource::OpenAi => "OpenAI",
ConfigurationSource::Ollama => "ollama"
},
server_reply=option_info(.0.as_deref(), "server replied with "),
hint=match *.1 {
ConfigurationSource::User => "\n - Hint: Check the `apiKey` parameter in the embedder configuration",
ConfigurationSource::OpenAi => "\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables",
ConfigurationSource::Ollama => "\n - Hint: Check the `apiKey` parameter in the embedder configuration"
})]
RestUnauthorized(Option<String>, ConfigurationSource),
#[error("sent too many requests to embedding server{}", option_info(.0.as_deref(), "server replied with "))]
RestTooManyRequests(Option<String>),
#[error("sent a bad request to embedding server{}{}",
@@ -136,8 +146,14 @@ impl EmbedError {
}
}
pub(crate) fn rest_unauthorized(error_response: Option<String>) -> EmbedError {
Self { kind: EmbedErrorKind::RestUnauthorized(error_response), fault: FaultSource::User }
pub(crate) fn rest_unauthorized(
error_response: Option<String>,
configuration_source: ConfigurationSource,
) -> EmbedError {
Self {
kind: EmbedErrorKind::RestUnauthorized(error_response, configuration_source),
fault: FaultSource::User,
}
}
pub(crate) fn rest_too_many_requests(error_response: Option<String>) -> EmbedError {
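A minimal sketch of the error-message pattern used above, with illustrative names rather than milli's real types (assumes the `thiserror` crate): the `#[error]` attribute accepts named arguments computed from the variant's fields (`.0`, `.1`, …), so the rendered message and hint can vary with the configuration source.

use thiserror::Error;

#[derive(Debug, Clone, Copy)]
enum Source { User, OpenAi }

#[derive(Error, Debug)]
enum Kind {
    #[error("could not authenticate against {embedding} server{hint}",
        embedding = match *.1 { Source::User => "embedding", Source::OpenAi => "OpenAI" },
        hint = match *.1 {
            Source::User => "\n  - Hint: check the `apiKey` parameter",
            Source::OpenAi => "\n  - Hint: check the `apiKey` parameter or the API key environment variable",
        })]
    Unauthorized(Option<String>, Source),
}

fn main() {
    // the same variant renders differently depending on the configuration source
    println!("{}", Kind::Unauthorized(None, Source::User));
    println!("{}", Kind::Unauthorized(None, Source::OpenAi));
}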

View file

@@ -183,7 +183,7 @@ impl Embedder {
let rest_embedder = RestEmbedder::new(
RestEmbedderOptions {
api_key: Some(api_key.clone()),
api_key: (!api_key.is_empty()).then(|| api_key.clone()),
distribution: None,
dimensions: Some(options.dimensions()),
url,
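A tiny sketch of the `api_key` change, nothing milli-specific: `bool::then` turns the non-emptiness check into an `Option`, so an empty key is treated as absent rather than passed along as an empty string.

fn optional_api_key(api_key: &str) -> Option<String> {
    // empty means "not configured" rather than a real (empty) credential
    (!api_key.is_empty()).then(|| api_key.to_string())
}

fn main() {
    assert_eq!(optional_api_key(""), None);
    assert_eq!(optional_api_key("sk-123"), Some("sk-123".to_string()));
}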

View file

@@ -275,7 +275,10 @@ fn check_response(
Err(ureq::Error::Status(code, response)) => {
let error_response: Option<String> = response.into_string().ok();
Err(match code {
401 => Retry::give_up(EmbedError::rest_unauthorized(error_response)),
401 => Retry::give_up(EmbedError::rest_unauthorized(
error_response,
configuration_source,
)),
429 => Retry::rate_limited(EmbedError::rest_too_many_requests(error_response)),
400 => Retry::give_up(EmbedError::rest_bad_request(
error_response,