mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 04:17:10 +02:00
Merge branch 'main' into merge-release-v1.8.1-in-main
Commit 1ab88e10b9
31 changed files with 2101 additions and 194 deletions
@@ -12,7 +12,10 @@ use bimap::BiHashMap;
 pub use builder::DocumentsBatchBuilder;
 pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
 use obkv::KvReader;
-pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
+pub use primary_key::{
+    validate_document_id_value, DocumentIdExtractionError, FieldIdMapper, PrimaryKey,
+    DEFAULT_PRIMARY_KEY,
+};
 pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};

@@ -60,7 +60,7 @@ impl<'a> PrimaryKey<'a> {
             Some(document_id_bytes) => {
                 let document_id = serde_json::from_slice(document_id_bytes)
                     .map_err(InternalError::SerdeJson)?;
-                match validate_document_id_value(document_id)? {
+                match validate_document_id_value(document_id) {
                     Ok(document_id) => Ok(Ok(document_id)),
                     Err(user_error) => {
                         Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
@@ -88,7 +88,7 @@ impl<'a> PrimaryKey<'a> {
                 }

                 match matching_documents_ids.pop() {
-                    Some(document_id) => match validate_document_id_value(document_id)? {
+                    Some(document_id) => match validate_document_id_value(document_id) {
                         Ok(document_id) => Ok(Ok(document_id)),
                         Err(user_error) => {
                             Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
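Both call sites above use the crate's nested-result convention: the outer Result carries internal errors, while the inner value distinguishes a usable document id from a user-facing DocumentIdExtractionError. An illustrative shape (the function name is hypothetical):

    // Outer Result: internal failures (e.g. a serde_json error).
    // Inner StdResult: user-facing outcome, so an invalid id is returned
    // as data instead of aborting the whole indexing operation.
    fn extract_docid(raw: &[u8]) -> Result<StdResult<String, DocumentIdExtractionError>> {
        // ... deserialize, then validate_document_id_value(...)
        todo!()
    }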
@@ -159,14 +159,14 @@ fn validate_document_id(document_id: &str) -> Option<&str> {
     }
 }

-pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
+pub fn validate_document_id_value(document_id: Value) -> StdResult<String, UserError> {
     match document_id {
         Value::String(string) => match validate_document_id(&string) {
-            Some(s) if s.len() == string.len() => Ok(Ok(string)),
-            Some(s) => Ok(Ok(s.to_string())),
-            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
+            Some(s) if s.len() == string.len() => Ok(string),
+            Some(s) => Ok(s.to_string()),
+            None => Err(UserError::InvalidDocumentId { document_id: Value::String(string) }),
         },
-        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
-        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
+        Value::Number(number) if number.is_i64() => Ok(number.to_string()),
+        content => Err(UserError::InvalidDocumentId { document_id: content }),
     }
 }

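With the outer Result gone, validate_document_id_value now reports an invalid id directly in its return value. A minimal sketch of the new call pattern, assuming milli's types are in scope (the sample id is illustrative):

    use serde_json::json;

    // No `?` on the call anymore: the single-layer result is matched directly.
    match validate_document_id_value(json!("doc-1")) {
        Ok(id) => println!("valid id: {id}"),
        Err(user_error) => println!("invalid id: {user_error:?}"),
    }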
@@ -1595,6 +1595,22 @@ impl Index {
             .unwrap_or_default())
     }

+    pub fn arroy_readers<'a>(
+        &'a self,
+        rtxn: &'a RoTxn<'a>,
+        embedder_id: u8,
+    ) -> impl Iterator<Item = Result<arroy::Reader<arroy::distances::Angular>>> + 'a {
+        crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| {
+            arroy::Reader::open(rtxn, k, self.vector_arroy)
+                .map(Some)
+                .or_else(|e| match e {
+                    arroy::Error::MissingMetadata => Ok(None),
+                    e => Err(e.into()),
+                })
+                .transpose()
+        })
+    }
+
     pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
         self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
     }

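The new Index::arroy_readers helper yields one reader per arroy database belonging to the embedder, stopping at the first database whose metadata is missing. Call sites later in this diff collect the iterator like this (a sketch, assuming an open rtxn and a known embedder_id):

    // Collect all readers, propagating the first arroy error if any.
    let readers: std::result::Result<Vec<_>, _> =
        index.arroy_readers(&rtxn, embedder_id).collect();
    let readers = readers?;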
@@ -63,6 +63,7 @@ pub use self::heed_codec::{
 };
 pub use self::index::Index;
 pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
+pub use self::search::similar::Similar;
 pub use self::search::{
     FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
     Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,

@@ -24,6 +24,7 @@ pub mod facet;
 mod fst_utils;
 pub mod hybrid;
 pub mod new;
+pub mod similar;

 #[derive(Debug, Clone)]
 pub struct SemanticSearch {
@@ -148,7 +149,7 @@ impl<'a> Search<'a> {
     pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
         if has_vector_search {
             let ctx = SearchContext::new(self.index, self.rtxn)?;
-            filtered_universe(&ctx, &self.filter)
+            filtered_universe(ctx.index, ctx.txn, &self.filter)
         } else {
             Ok(self.execute()?.candidates)
         }
@@ -161,7 +162,7 @@ impl<'a> Search<'a> {
             ctx.attributes_to_search_on(searchable_attributes)?;
         }

-        let universe = filtered_universe(&ctx, &self.filter)?;
+        let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
         let PartialSearchResult {
             located_query_terms,
             candidates,

@@ -507,7 +507,7 @@ mod tests {
     impl<'a> MatcherBuilder<'a> {
         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
             let mut ctx = SearchContext::new(index, rtxn).unwrap();
-            let universe = filtered_universe(&ctx, &None).unwrap();
+            let universe = filtered_universe(ctx.index, ctx.txn, &None).unwrap();
             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
                 &mut ctx,
                 Some(query),

@@ -543,11 +543,15 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
     Ok(())
 }

-pub fn filtered_universe(ctx: &SearchContext, filters: &Option<Filter>) -> Result<RoaringBitmap> {
+pub fn filtered_universe(
+    index: &Index,
+    txn: &RoTxn<'_>,
+    filters: &Option<Filter>,
+) -> Result<RoaringBitmap> {
     Ok(if let Some(filters) = filters {
-        filters.evaluate(ctx.txn, ctx.index)?
+        filters.evaluate(txn, index)?
     } else {
-        ctx.index.documents_ids(ctx.txn)?
+        index.documents_ids(txn)?
     })
 }

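Taking the index and transaction directly, rather than a whole SearchContext, lets callers that never build a context (such as the new Similar search below) reuse the same universe computation. The migration at each call site is mechanical; a sketch:

    // Before: filtered_universe(&ctx, &self.filter)
    // After: pass only the pieces the function actually needs.
    let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;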
@@ -49,19 +49,8 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
         ctx: &mut SearchContext<'_>,
         vector_candidates: &RoaringBitmap,
     ) -> Result<()> {
-        let writer_index = (self.embedder_index as u16) << 8;
-        let readers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
-            .map_while(|k| {
-                arroy::Reader::open(ctx.txn, writer_index | (k as u16), ctx.index.vector_arroy)
-                    .map(Some)
-                    .or_else(|e| match e {
-                        arroy::Error::MissingMetadata => Ok(None),
-                        e => Err(e),
-                    })
-                    .transpose()
-            })
-            .collect();
-
+        let readers: std::result::Result<Vec<_>, _> =
+            ctx.index.arroy_readers(ctx.txn, self.embedder_index).collect();
         let readers = readers?;

         let target = &self.target;

milli/src/search/similar.rs (new file, 111 lines)
@@ -0,0 +1,111 @@
+use std::sync::Arc;
+
+use ordered_float::OrderedFloat;
+use roaring::RoaringBitmap;
+
+use crate::score_details::{self, ScoreDetails};
+use crate::vector::Embedder;
+use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
+
+pub struct Similar<'a> {
+    id: DocumentId,
+    // this should be linked to the String in the query
+    filter: Option<Filter<'a>>,
+    offset: usize,
+    limit: usize,
+    rtxn: &'a heed::RoTxn<'a>,
+    index: &'a Index,
+    embedder_name: String,
+    embedder: Arc<Embedder>,
+}
+
+impl<'a> Similar<'a> {
+    pub fn new(
+        id: DocumentId,
+        offset: usize,
+        limit: usize,
+        index: &'a Index,
+        rtxn: &'a heed::RoTxn<'a>,
+        embedder_name: String,
+        embedder: Arc<Embedder>,
+    ) -> Self {
+        Self { id, filter: None, offset, limit, rtxn, index, embedder_name, embedder }
+    }
+
+    pub fn filter(&mut self, filter: Filter<'a>) -> &mut Self {
+        self.filter = Some(filter);
+        self
+    }
+
+    pub fn execute(&self) -> Result<SearchResult> {
+        let universe = filtered_universe(self.index, self.rtxn, &self.filter)?;
+
+        let embedder_index =
+            self.index
+                .embedder_category_id
+                .get(self.rtxn, &self.embedder_name)?
+                .ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?;
+
+        let readers: std::result::Result<Vec<_>, _> =
+            self.index.arroy_readers(self.rtxn, embedder_index).collect();
+
+        let readers = readers?;
+
+        let mut results = Vec::new();
+
+        for reader in readers.iter() {
+            let nns_by_item = reader.nns_by_item(
+                self.rtxn,
+                self.id,
+                self.limit + self.offset + 1,
+                None,
+                Some(&universe),
+            )?;
+            if let Some(mut nns_by_item) = nns_by_item {
+                results.append(&mut nns_by_item);
+            } else {
+                break;
+            }
+        }
+
+        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+
+        let mut documents_ids = Vec::with_capacity(self.limit);
+        let mut document_scores = Vec::with_capacity(self.limit);
+        // list of documents we've already seen, so that we don't return the same document multiple times.
+        // initialized to the target document, that we never want to return.
+        let mut documents_seen = RoaringBitmap::new();
+        documents_seen.insert(self.id);
+
+        for (docid, distance) in results
+            .into_iter()
+            // skip documents we've already seen & mark that we saw the current document
+            .filter(|(docid, _)| documents_seen.insert(*docid))
+            .skip(self.offset)
+            // take **after** filter and skip so that we get exactly limit elements if available
+            .take(self.limit)
+        {
+            documents_ids.push(docid);
+
+            let score = 1.0 - distance;
+            let score = self
+                .embedder
+                .distribution()
+                .map(|distribution| distribution.shift(score))
+                .unwrap_or(score);
+
+            let score = ScoreDetails::Vector(score_details::Vector { similarity: Some(score) });
+
+            document_scores.push(vec![score]);
+        }
+
+        Ok(SearchResult {
+            matching_words: Default::default(),
+            candidates: universe,
+            documents_ids,
+            document_scores,
+            degraded: false,
+            used_negative_operator: false,
+        })
+    }
+}
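A usage sketch for the new API (the document id, pagination values, embedder name, and embedder handle are illustrative):

    // Up to 20 documents similar to document 42, skipping none.
    let mut similar =
        Similar::new(42, 0, 20, &index, &rtxn, "default".to_string(), embedder.clone());
    if let Some(filter) = maybe_filter {
        similar.filter(filter);
    }
    let SearchResult { documents_ids, document_scores, .. } = similar.execute()?;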
@@ -538,10 +538,8 @@ where
         )?;

         pool.install(|| {
-            let writer_index = (embedder_index as u16) << 8;
-            for k in 0..=u8::MAX {
-                let writer =
-                    arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension);
+            for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
+                let writer = arroy::Writer::new(vector_arroy, k, dimension);
                 if writer.is_empty(wtxn)? {
                     break;
                 }

@@ -634,16 +634,9 @@ pub(crate) fn write_typed_chunk_into_index(
             let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
                 InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
             )?;
-            let writer_index = (embedder_index as u16) << 8;
             // FIXME: allow customizing distance
-            let writers: Vec<_> = (0..=u8::MAX)
-                .map(|k| {
-                    arroy::Writer::new(
-                        index.vector_arroy,
-                        writer_index | (k as u16),
-                        expected_dimension,
-                    )
-                })
+            let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
+                .map(|k| arroy::Writer::new(index.vector_arroy, k, expected_dimension))
                 .collect();

             // remove vectors for docids we want them removed

@@ -442,3 +442,9 @@ impl DistributionShift {
 pub const fn is_cuda_enabled() -> bool {
     cfg!(feature = "cuda")
 }
+
+pub fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
+    let embedder_id = (embedder_id as u16) << 8;
+
+    (0..=u8::MAX).map(move |k| embedder_id | (k as u16))
+}
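Each embedder therefore owns a contiguous block of 256 arroy database indexes: the embedder id is the high byte and the per-embedder counter the low byte. A quick check of the arithmetic:

    // For embedder_id = 1 the range is 0x0100..=0x01FF (256..=511).
    let range: Vec<u16> = arroy_db_range_for_embedder(1).collect();
    assert_eq!(range.first(), Some(&0x0100));
    assert_eq!(range.last(), Some(&0x01FF));
    assert_eq!(range.len(), 256);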