mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
search: introduce hitmaker
This commit is contained in:
parent
2123d76089
commit
d3a6d2a6fa
@ -1,6 +1,6 @@
|
|||||||
use core::fmt;
|
use core::fmt;
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use std::collections::{BTreeMap, BTreeSet, HashSet};
|
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
@ -913,8 +913,13 @@ pub fn perform_search(
|
|||||||
show_ranking_score_details,
|
show_ranking_score_details,
|
||||||
};
|
};
|
||||||
|
|
||||||
let documents =
|
let documents = make_hits(
|
||||||
make_hits(index, &rtxn, format, matching_words, documents_ids, document_scores)?;
|
index,
|
||||||
|
&rtxn,
|
||||||
|
format,
|
||||||
|
matching_words,
|
||||||
|
documents_ids.iter().copied().zip(document_scores.iter()),
|
||||||
|
)?;
|
||||||
|
|
||||||
let number_of_hits = min(candidates.len() as usize, max_total_hits);
|
let number_of_hits = min(candidates.len() as usize, max_total_hits);
|
||||||
let hits_info = if is_finite_pagination {
|
let hits_info = if is_finite_pagination {
|
||||||
@ -1043,131 +1048,191 @@ impl RetrieveVectors {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn make_hits(
|
struct HitMaker<'a> {
|
||||||
index: &Index,
|
index: &'a Index,
|
||||||
rtxn: &RoTxn<'_>,
|
rtxn: &'a RoTxn<'a>,
|
||||||
format: AttributesFormat,
|
fields_ids_map: FieldsIdsMap,
|
||||||
matching_words: milli::MatchingWords,
|
displayed_ids: BTreeSet<FieldId>,
|
||||||
documents_ids: Vec<u32>,
|
vectors_fid: Option<FieldId>,
|
||||||
document_scores: Vec<Vec<ScoreDetails>>,
|
retrieve_vectors: RetrieveVectors,
|
||||||
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
|
to_retrieve_ids: BTreeSet<FieldId>,
|
||||||
let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
|
embedding_configs: Vec<milli::index::IndexEmbeddingConfig>,
|
||||||
let displayed_ids =
|
formatter_builder: MatcherBuilder<'a>,
|
||||||
index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>());
|
formatted_options: BTreeMap<FieldId, FormatOptions>,
|
||||||
|
show_ranking_score: bool,
|
||||||
|
show_ranking_score_details: bool,
|
||||||
|
sort: Option<Vec<String>>,
|
||||||
|
show_matches_position: bool,
|
||||||
|
}
|
||||||
|
|
||||||
let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
impl<'a> HitMaker<'a> {
|
||||||
|
pub fn tokenizer<'b>(
|
||||||
|
script_lang_map: &'b HashMap<milli::tokenizer::Script, Vec<milli::tokenizer::Language>>,
|
||||||
|
dictionary: Option<&'b [&'b str]>,
|
||||||
|
separators: Option<&'b [&'b str]>,
|
||||||
|
) -> milli::tokenizer::Tokenizer<'b> {
|
||||||
|
let mut tokenizer_builder = TokenizerBuilder::default();
|
||||||
|
tokenizer_builder.create_char_map(true);
|
||||||
|
if !script_lang_map.is_empty() {
|
||||||
|
tokenizer_builder.allow_list(script_lang_map);
|
||||||
|
}
|
||||||
|
|
||||||
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
if let Some(separators) = separators {
|
||||||
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
tokenizer_builder.separators(separators);
|
||||||
(None, _) => false,
|
}
|
||||||
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
|
|
||||||
(Some(_), None) => true,
|
|
||||||
// displayed_ids is a finite list, so hide if `_vectors` is not part of it
|
|
||||||
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
|
||||||
};
|
|
||||||
|
|
||||||
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
|
if let Some(dictionary) = dictionary {
|
||||||
if vectors_is_hidden {
|
tokenizer_builder.words_dict(dictionary);
|
||||||
RetrieveVectors::Hide
|
}
|
||||||
|
|
||||||
|
tokenizer_builder.into_tokenizer()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn formatter_builder(
|
||||||
|
matching_words: milli::MatchingWords,
|
||||||
|
tokenizer: milli::tokenizer::Tokenizer<'_>,
|
||||||
|
) -> MatcherBuilder<'_> {
|
||||||
|
let formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
|
||||||
|
|
||||||
|
formatter_builder
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new(
|
||||||
|
index: &'a Index,
|
||||||
|
rtxn: &'a RoTxn<'a>,
|
||||||
|
format: AttributesFormat,
|
||||||
|
mut formatter_builder: MatcherBuilder<'a>,
|
||||||
|
) -> Result<Self, MeilisearchHttpError> {
|
||||||
|
formatter_builder.crop_marker(format.crop_marker);
|
||||||
|
formatter_builder.highlight_prefix(format.highlight_pre_tag);
|
||||||
|
formatter_builder.highlight_suffix(format.highlight_post_tag);
|
||||||
|
|
||||||
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||||
|
let displayed_ids = index
|
||||||
|
.displayed_fields_ids(rtxn)?
|
||||||
|
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>());
|
||||||
|
|
||||||
|
let vectors_fid =
|
||||||
|
fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||||
|
|
||||||
|
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
||||||
|
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
||||||
|
(None, _) => false,
|
||||||
|
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
|
||||||
|
(Some(_), None) => true,
|
||||||
|
// displayed_ids is a finite list, so hide if `_vectors` is not part of it
|
||||||
|
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
||||||
|
};
|
||||||
|
|
||||||
|
let displayed_ids =
|
||||||
|
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
|
||||||
|
|
||||||
|
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
|
||||||
|
if vectors_is_hidden {
|
||||||
|
RetrieveVectors::Hide
|
||||||
|
} else {
|
||||||
|
RetrieveVectors::Retrieve
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
RetrieveVectors::Retrieve
|
format.retrieve_vectors
|
||||||
}
|
};
|
||||||
} else {
|
|
||||||
format.retrieve_vectors
|
|
||||||
};
|
|
||||||
|
|
||||||
let displayed_ids =
|
let fids = |attrs: &BTreeSet<String>| {
|
||||||
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
|
let mut ids = BTreeSet::new();
|
||||||
let fids = |attrs: &BTreeSet<String>| {
|
for attr in attrs {
|
||||||
let mut ids = BTreeSet::new();
|
if attr == "*" {
|
||||||
for attr in attrs {
|
ids.clone_from(&displayed_ids);
|
||||||
if attr == "*" {
|
break;
|
||||||
ids.clone_from(&displayed_ids);
|
}
|
||||||
break;
|
|
||||||
|
if let Some(id) = fields_ids_map.id(attr) {
|
||||||
|
ids.insert(id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
ids
|
||||||
|
};
|
||||||
|
let to_retrieve_ids: BTreeSet<_> = format
|
||||||
|
.attributes_to_retrieve
|
||||||
|
.as_ref()
|
||||||
|
.map(fids)
|
||||||
|
.unwrap_or_else(|| displayed_ids.clone())
|
||||||
|
.intersection(&displayed_ids)
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
|
||||||
if let Some(id) = fields_ids_map.id(attr) {
|
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
|
||||||
ids.insert(id);
|
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
|
||||||
}
|
let formatted_options = compute_formatted_options(
|
||||||
}
|
&attr_to_highlight,
|
||||||
ids
|
&attr_to_crop,
|
||||||
};
|
format.crop_length,
|
||||||
let to_retrieve_ids: BTreeSet<_> = format
|
&to_retrieve_ids,
|
||||||
.attributes_to_retrieve
|
&fields_ids_map,
|
||||||
.as_ref()
|
&displayed_ids,
|
||||||
.map(fids)
|
);
|
||||||
.unwrap_or_else(|| displayed_ids.clone())
|
|
||||||
.intersection(&displayed_ids)
|
|
||||||
.cloned()
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
|
let embedding_configs = index.embedding_configs(rtxn)?;
|
||||||
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
|
|
||||||
let formatted_options = compute_formatted_options(
|
Ok(Self {
|
||||||
&attr_to_highlight,
|
index,
|
||||||
&attr_to_crop,
|
rtxn,
|
||||||
format.crop_length,
|
fields_ids_map,
|
||||||
&to_retrieve_ids,
|
displayed_ids,
|
||||||
&fields_ids_map,
|
vectors_fid,
|
||||||
&displayed_ids,
|
retrieve_vectors,
|
||||||
);
|
to_retrieve_ids,
|
||||||
let mut tokenizer_builder = TokenizerBuilder::default();
|
embedding_configs,
|
||||||
tokenizer_builder.create_char_map(true);
|
formatter_builder,
|
||||||
let script_lang_map = index.script_language(rtxn)?;
|
formatted_options,
|
||||||
if !script_lang_map.is_empty() {
|
show_ranking_score: format.show_ranking_score,
|
||||||
tokenizer_builder.allow_list(&script_lang_map);
|
show_ranking_score_details: format.show_ranking_score_details,
|
||||||
|
show_matches_position: format.show_matches_position,
|
||||||
|
sort: format.sort,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
let separators = index.allowed_separators(rtxn)?;
|
|
||||||
let separators: Option<Vec<_>> =
|
pub fn make_hit(
|
||||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
&self,
|
||||||
if let Some(ref separators) = separators {
|
id: u32,
|
||||||
tokenizer_builder.separators(separators);
|
score: &[ScoreDetails],
|
||||||
}
|
) -> Result<SearchHit, MeilisearchHttpError> {
|
||||||
let dictionary = index.dictionary(rtxn)?;
|
let (_, obkv) =
|
||||||
let dictionary: Option<Vec<_>> =
|
self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
|
||||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
|
||||||
if let Some(ref dictionary) = dictionary {
|
|
||||||
tokenizer_builder.words_dict(dictionary);
|
|
||||||
}
|
|
||||||
let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
|
|
||||||
formatter_builder.crop_marker(format.crop_marker);
|
|
||||||
formatter_builder.highlight_prefix(format.highlight_pre_tag);
|
|
||||||
formatter_builder.highlight_suffix(format.highlight_post_tag);
|
|
||||||
let mut documents = Vec::new();
|
|
||||||
let embedding_configs = index.embedding_configs(rtxn)?;
|
|
||||||
let documents_iter = index.documents(rtxn, documents_ids)?;
|
|
||||||
for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
|
|
||||||
// First generate a document with all the displayed fields
|
// First generate a document with all the displayed fields
|
||||||
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
|
let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;
|
||||||
|
|
||||||
let add_vectors_fid =
|
let add_vectors_fid =
|
||||||
vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve);
|
self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);
|
||||||
|
|
||||||
// select the attributes to retrieve
|
// select the attributes to retrieve
|
||||||
let attributes_to_retrieve = to_retrieve_ids
|
let attributes_to_retrieve = self
|
||||||
|
.to_retrieve_ids
|
||||||
.iter()
|
.iter()
|
||||||
// skip the vectors_fid if RetrieveVectors::Hide
|
// skip the vectors_fid if RetrieveVectors::Hide
|
||||||
.filter(|fid| match vectors_fid {
|
.filter(|fid| match self.vectors_fid {
|
||||||
Some(vectors_fid) => {
|
Some(vectors_fid) => {
|
||||||
!(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
|
!(self.retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
|
||||||
}
|
}
|
||||||
None => true,
|
None => true,
|
||||||
})
|
})
|
||||||
// need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
|
// need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
|
||||||
.chain(add_vectors_fid.iter())
|
.chain(add_vectors_fid.iter())
|
||||||
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
|
.map(|&fid| self.fields_ids_map.name(fid).expect("Missing field name"));
|
||||||
|
|
||||||
let mut document =
|
let mut document =
|
||||||
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
|
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
|
||||||
|
|
||||||
if retrieve_vectors == RetrieveVectors::Retrieve {
|
if self.retrieve_vectors == RetrieveVectors::Retrieve {
|
||||||
// Clippy is wrong
|
// Clippy is wrong
|
||||||
#[allow(clippy::manual_unwrap_or_default)]
|
#[allow(clippy::manual_unwrap_or_default)]
|
||||||
let mut vectors = match document.remove("_vectors") {
|
let mut vectors = match document.remove("_vectors") {
|
||||||
Some(Value::Object(map)) => map,
|
Some(Value::Object(map)) => map,
|
||||||
_ => Default::default(),
|
_ => Default::default(),
|
||||||
};
|
};
|
||||||
for (name, vector) in index.embeddings(rtxn, id)? {
|
for (name, vector) in self.index.embeddings(self.rtxn, id)? {
|
||||||
let user_provided = embedding_configs
|
let user_provided = self
|
||||||
|
.embedding_configs
|
||||||
.iter()
|
.iter()
|
||||||
.find(|conf| conf.name == name)
|
.find(|conf| conf.name == name)
|
||||||
.is_some_and(|conf| conf.user_provided.contains(id));
|
.is_some_and(|conf| conf.user_provided.contains(id));
|
||||||
@ -1180,21 +1245,21 @@ fn make_hits(
|
|||||||
|
|
||||||
let (matches_position, formatted) = format_fields(
|
let (matches_position, formatted) = format_fields(
|
||||||
&displayed_document,
|
&displayed_document,
|
||||||
&fields_ids_map,
|
&self.fields_ids_map,
|
||||||
&formatter_builder,
|
&self.formatter_builder,
|
||||||
&formatted_options,
|
&self.formatted_options,
|
||||||
format.show_matches_position,
|
self.show_matches_position,
|
||||||
&displayed_ids,
|
&self.displayed_ids,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
if let Some(sort) = format.sort.as_ref() {
|
if let Some(sort) = self.sort.as_ref() {
|
||||||
insert_geo_distance(sort, &mut document);
|
insert_geo_distance(sort, &mut document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let ranking_score =
|
let ranking_score =
|
||||||
format.show_ranking_score.then(|| ScoreDetails::global_score(score.iter()));
|
self.show_ranking_score.then(|| ScoreDetails::global_score(score.iter()));
|
||||||
let ranking_score_details =
|
let ranking_score_details =
|
||||||
format.show_ranking_score_details.then(|| ScoreDetails::to_json_map(score.iter()));
|
self.show_ranking_score_details.then(|| ScoreDetails::to_json_map(score.iter()));
|
||||||
|
|
||||||
let hit = SearchHit {
|
let hit = SearchHit {
|
||||||
document,
|
document,
|
||||||
@ -1203,7 +1268,38 @@ fn make_hits(
|
|||||||
ranking_score_details,
|
ranking_score_details,
|
||||||
ranking_score,
|
ranking_score,
|
||||||
};
|
};
|
||||||
documents.push(hit);
|
|
||||||
|
Ok(hit)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_hits<'a>(
|
||||||
|
index: &Index,
|
||||||
|
rtxn: &RoTxn<'_>,
|
||||||
|
format: AttributesFormat,
|
||||||
|
matching_words: milli::MatchingWords,
|
||||||
|
documents_ids_scores: impl Iterator<Item = (u32, &'a Vec<ScoreDetails>)> + 'a,
|
||||||
|
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
|
||||||
|
let mut documents = Vec::new();
|
||||||
|
|
||||||
|
let script_lang_map = index.script_language(rtxn)?;
|
||||||
|
|
||||||
|
let dictionary = index.dictionary(rtxn)?;
|
||||||
|
let dictionary: Option<Vec<_>> =
|
||||||
|
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||||
|
let separators = index.allowed_separators(rtxn)?;
|
||||||
|
let separators: Option<Vec<_>> =
|
||||||
|
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||||
|
|
||||||
|
let tokenizer =
|
||||||
|
HitMaker::tokenizer(&script_lang_map, dictionary.as_deref(), separators.as_deref());
|
||||||
|
|
||||||
|
let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
|
||||||
|
|
||||||
|
let hit_maker = HitMaker::new(index, rtxn, format, formatter_builder)?;
|
||||||
|
|
||||||
|
for (id, score) in documents_ids_scores {
|
||||||
|
documents.push(hit_maker.make_hit(id, score)?);
|
||||||
}
|
}
|
||||||
Ok(documents)
|
Ok(documents)
|
||||||
}
|
}
|
||||||
@ -1319,7 +1415,13 @@ pub fn perform_similar(
|
|||||||
show_ranking_score_details,
|
show_ranking_score_details,
|
||||||
};
|
};
|
||||||
|
|
||||||
let hits = make_hits(index, &rtxn, format, Default::default(), documents_ids, document_scores)?;
|
let hits = make_hits(
|
||||||
|
index,
|
||||||
|
&rtxn,
|
||||||
|
format,
|
||||||
|
Default::default(),
|
||||||
|
documents_ids.iter().copied().zip(document_scores.iter()),
|
||||||
|
)?;
|
||||||
|
|
||||||
let max_total_hits = index
|
let max_total_hits = index
|
||||||
.pagination_max_total_hits(&rtxn)
|
.pagination_max_total_hits(&rtxn)
|
||||||
@ -1492,10 +1594,10 @@ fn make_document(
|
|||||||
Ok(document)
|
Ok(document)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn format_fields<'a>(
|
fn format_fields(
|
||||||
document: &Document,
|
document: &Document,
|
||||||
field_ids_map: &FieldsIdsMap,
|
field_ids_map: &FieldsIdsMap,
|
||||||
builder: &'a MatcherBuilder<'a>,
|
builder: &MatcherBuilder<'_>,
|
||||||
formatted_options: &BTreeMap<FieldId, FormatOptions>,
|
formatted_options: &BTreeMap<FieldId, FormatOptions>,
|
||||||
compute_matches: bool,
|
compute_matches: bool,
|
||||||
displayable_ids: &BTreeSet<FieldId>,
|
displayable_ids: &BTreeSet<FieldId>,
|
||||||
@ -1550,9 +1652,9 @@ fn format_fields<'a>(
|
|||||||
Ok((matches_position, document))
|
Ok((matches_position, document))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn format_value<'a>(
|
fn format_value(
|
||||||
value: Value,
|
value: Value,
|
||||||
builder: &'a MatcherBuilder<'a>,
|
builder: &MatcherBuilder<'_>,
|
||||||
format_options: Option<FormatOptions>,
|
format_options: Option<FormatOptions>,
|
||||||
infos: &mut Vec<MatchBounds>,
|
infos: &mut Vec<MatchBounds>,
|
||||||
compute_matches: bool,
|
compute_matches: bool,
|
||||||
|
Loading…
Reference in New Issue
Block a user