Merge branch 'main' into settings-customizing-tokenization

This commit is contained in:
ManyTheFish 2023-08-08 16:08:16 +02:00
commit 4a21fecf67
166 changed files with 2252 additions and 1072 deletions

View file

@ -1,20 +1,36 @@
use std::ops;
use instant_distance::Point;
use serde::{Deserialize, Serialize};
use space::Metric;
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct DotProduct;
use crate::normalize_vector;
impl Metric<Vec<f32>> for DotProduct {
type Unit = u32;
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct NDotProductPoint(Vec<f32>);
// Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
//
// Here is a playground that validate the ordering of the bit representation of floats in range 0.0..=1.0:
// <https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=6c59e31a3cc5036b32edf51e8937b56e>
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let dist = 1.0 - dot_product_similarity(a, b);
impl NDotProductPoint {
pub fn new(point: Vec<f32>) -> Self {
NDotProductPoint(normalize_vector(point))
}
pub fn into_inner(self) -> Vec<f32> {
self.0
}
}
impl ops::Deref for NDotProductPoint {
    type Target = [f32];

    /// Exposes the stored coordinates as a read-only slice.
    fn deref(&self) -> &Self::Target {
        // `&Vec<f32>` coerces to `&[f32]` at the return site.
        &self.0
    }
}
impl Point for NDotProductPoint {
fn distance(&self, other: &Self) -> f32 {
let dist = 1.0 - dot_product_similarity(&self.0, &other.0);
debug_assert!(!dist.is_nan());
dist.to_bits()
dist
}
}

View file

@ -0,0 +1,27 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
/// Heed codec for items made of a `u16`, stored as 2 big-endian bytes,
/// immediately followed by the bytes of a UTF-8 string.
pub struct BEU16StrCodec;
impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
    type DItem = (u16, &'a str);

    /// Decodes a 2-byte big-endian `u16` prefix followed by a UTF-8 string.
    ///
    /// Returns `None` when `bytes` is shorter than the 2-byte prefix or when the
    /// bytes after the prefix are not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        // `split_at` panics when the slice is shorter than the split index, so a
        // truncated input must be rejected before splitting.
        if bytes.len() < 2 {
            return None;
        }
        let (n_bytes, str_bytes) = bytes.split_at(2);
        let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
        let s = str::from_utf8(str_bytes).ok()?;
        Some((n, s))
    }
}
impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
    type EItem = (u16, &'a str);

    /// Encodes the pair as the big-endian bytes of the `u16` followed by the
    /// raw bytes of the string. Always returns `Some`.
    fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
        let prefix = n.to_be_bytes();
        // Joins both parts into one freshly allocated buffer.
        let encoded = [&prefix[..], s.as_bytes()].concat();
        Some(Cow::Owned(encoded))
    }
}

View file

@ -1,3 +1,4 @@
mod beu16_str_codec;
mod beu32_str_codec;
mod byte_slice_ref;
pub mod facet;
@ -14,6 +15,7 @@ mod str_str_u8_codec;
pub use byte_slice_ref::ByteSliceRefCodec;
pub use str_ref::StrRefCodec;
pub use self::beu16_str_codec::BEU16StrCodec;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;

View file

@ -8,12 +8,11 @@ use charabia::{Language, Script};
use heed::flags::Flags;
use heed::types::*;
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
use rand_pcg::Pcg32;
use roaring::RoaringBitmap;
use rstar::RTree;
use time::OffsetDateTime;
use crate::distance::DotProduct;
use crate::distance::NDotProductPoint;
use crate::error::{InternalError, UserError};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
@ -21,7 +20,9 @@ use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FieldIdCodec, OrderedF64Codec,
};
use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
use crate::heed_codec::{
BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
};
use crate::readable_slices::ReadableSlices;
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -31,7 +32,7 @@ use crate::{
};
/// The HNSW data-structure that we serialize, fill and search in.
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
pub type Hnsw = instant_distance::Hnsw<NDotProductPoint>;
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
@ -100,6 +101,7 @@ pub mod db_name {
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FACET_ID_NORMALIZED_STRING_STRINGS: &str = "facet-id-normalized-string-strings";
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@ -161,6 +163,8 @@ pub struct Index {
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
/// Maps the facet field id and ranges of strings with the docids that corresponds to them.
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
/// Maps the facet field id of the normalized-for-search string facets with their original versions.
pub facet_id_normalized_string_strings: Database<BEU16StrCodec, SerdeJson<BTreeSet<String>>>,
/// Maps the facet field id of the string facets with an FST containing all the facets values.
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
@ -185,7 +189,7 @@ impl Index {
) -> Result<Index> {
use db_name::*;
options.max_dbs(24);
options.max_dbs(25);
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?;
@ -215,6 +219,8 @@ impl Index {
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids =
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_normalized_string_strings =
env.create_database(&mut wtxn, Some(FACET_ID_NORMALIZED_STRING_STRINGS))?;
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
let facet_id_exists_docids =
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
@ -250,6 +256,7 @@ impl Index {
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
facet_id_normalized_string_strings,
facet_id_string_fst,
facet_id_exists_docids,
facet_id_is_null_docids,

View file

@ -51,9 +51,10 @@ pub use self::error::{
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fields_ids_map::FieldsIdsMap;
pub use self::heed_codec::{
BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec,
CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec,
RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec,
BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
UncheckedU8StrStrCodec,
};
pub use self::index::Index;
pub use self::search::{

View file

@ -10,7 +10,7 @@ pub enum ScoreDetails {
Fid(Rank),
Position(Rank),
ExactAttribute(ExactAttribute),
Exactness(Rank),
ExactWords(ExactWords),
Sort(Sort),
GeoSort(GeoSort),
}
@ -28,7 +28,7 @@ impl ScoreDetails {
ScoreDetails::Fid(details) => Some(*details),
ScoreDetails::Position(details) => Some(*details),
ScoreDetails::ExactAttribute(details) => Some(details.rank()),
ScoreDetails::Exactness(details) => Some(*details),
ScoreDetails::ExactWords(details) => Some(details.rank()),
ScoreDetails::Sort(_) => None,
ScoreDetails::GeoSort(_) => None,
}
@ -84,7 +84,7 @@ impl ScoreDetails {
// For now, fid is a virtual rule always followed by the "position" rule
let fid_details = serde_json::json!({
"order": order,
"attribute_ranking_order_score": fid.local_score(),
"attributeRankingOrderScore": fid.local_score(),
});
details_map.insert("attribute".into(), fid_details);
order += 1;
@ -102,7 +102,7 @@ impl ScoreDetails {
};
attribute_details
.insert("query_word_distance_score".into(), position.local_score().into());
.insert("queryWordDistanceScore".into(), position.local_score().into());
let score = Rank::global_score([fid_details, *position].iter().copied());
attribute_details.insert("score".into(), score.into());
@ -117,7 +117,7 @@ impl ScoreDetails {
details_map.insert("exactness".into(), exactness_details);
order += 1;
}
ScoreDetails::Exactness(details) => {
ScoreDetails::ExactWords(details) => {
// For now, exactness is a virtual rule always preceded by the "ExactAttribute" rule
let exactness_details = details_map
.get_mut("exactness")
@ -129,9 +129,16 @@ impl ScoreDetails {
== &serde_json::json!(ExactAttribute::NoExactMatch)
{
let score = Rank::global_score(
[ExactAttribute::NoExactMatch.rank(), *details].iter().copied(),
[ExactAttribute::NoExactMatch.rank(), details.rank()].iter().copied(),
);
*exactness_details.get_mut("score").expect("missing score") = score.into();
// tiny detail, but we want the score to be the last displayed field,
// so we're removing it here, adding the other fields, then adding the new score
exactness_details.remove("score");
exactness_details
.insert("matchingWords".into(), details.matching_words.into());
exactness_details
.insert("maxMatchingWords".into(), details.max_matching_words.into());
exactness_details.insert("score".into(), score.into());
}
// do not update the order since this was already done by exactAttribute
}
@ -209,8 +216,34 @@ impl Words {
Rank { rank: self.matching_words, max_rank: self.max_matching_words }
}
pub(crate) fn from_rank(rank: Rank) -> Words {
Words { matching_words: rank.rank, max_matching_words: rank.max_rank }
pub(crate) fn from_rank(rank: Rank) -> Self {
Self { matching_words: rank.rank, max_matching_words: rank.max_rank }
}
}
/// Structure that is very similar to [`Words`], but whose semantics are slightly different.
///
/// In exactness, the number of matching words can actually be 0 with a non-zero score,
/// if no words from the query appear exactly in the document.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ExactWords {
/// Number of words from the query that appear exactly in the document (can be 0).
pub matching_words: u32,
/// Value of `matching_words` that corresponds to the first (best) rank.
pub max_matching_words: u32,
}
impl ExactWords {
    /// Converts the word counts into a [`Rank`].
    ///
    /// Both counts are shifted up by one, so that 0 matching words maps to the
    /// last rank (1).
    pub fn rank(&self) -> Rank {
        let Self { matching_words, max_matching_words } = *self;
        Rank { rank: matching_words + 1, max_rank: max_matching_words + 1 }
    }

    /// Rebuilds the word counts from a [`Rank`], undoing the shift applied by `rank`.
    ///
    /// The last rank (1) means that 0 words from the query appear exactly in the
    /// document; the first rank (`max_rank`) means that `max_rank - 1` of them do.
    /// The subtractions saturate at 0.
    pub(crate) fn from_rank(rank: Rank) -> Self {
        let matching_words = rank.rank.saturating_sub(1);
        let max_matching_words = rank.max_rank.saturating_sub(1);
        Self { matching_words, max_matching_words }
    }
}
@ -223,7 +256,7 @@ pub struct Typo {
impl Typo {
pub fn rank(&self) -> Rank {
Rank {
rank: self.max_typo_count - self.typo_count + 1,
rank: (self.max_typo_count + 1).saturating_sub(self.typo_count),
max_rank: (self.max_typo_count + 1),
}
}
@ -236,7 +269,10 @@ impl Typo {
// rank + typo = max_rank
// typo = max_rank - rank
pub fn from_rank(rank: Rank) -> Typo {
Typo { typo_count: rank.max_rank - rank.rank, max_typo_count: rank.max_rank - 1 }
Typo {
typo_count: rank.max_rank.saturating_sub(rank.rank),
max_typo_count: rank.max_rank.saturating_sub(1),
}
}
}

View file

@ -1,5 +1,8 @@
use std::fmt;
use std::ops::ControlFlow;
use charabia::normalizer::NormalizerOption;
use charabia::Normalize;
use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
@ -14,8 +17,8 @@ use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::{
execute_search, normalize_facet, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index,
Result, SearchContext, BEU16,
execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result,
SearchContext, BEU16,
};
// Building these factories is not free.
@ -301,29 +304,28 @@ impl<'a> SearchForFacetValues<'a> {
match self.query.as_ref() {
Some(query) => {
let query = normalize_facet(query);
let query = query.as_str();
let options = NormalizerOption { lossy: true, ..Default::default() };
let query = query.normalize(&options);
let query = query.as_ref();
let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
let field_authorizes_typos =
!self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);
if authorize_typos && field_authorizes_typos {
let mut results = vec![];
let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: query };
if let Some(FacetGroupValue { bitmap, .. }) =
index.facet_id_string_docids.get(rtxn, &key)?
{
let count = search_candidates.intersection_len(&bitmap);
if count != 0 {
let value = self
.one_original_value_of(fid, query, bitmap.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
}
let mut results = vec![];
if fst.contains(query) {
self.fetch_original_facets_using_normalized(
fid,
query,
query,
&search_candidates,
&mut results,
)?;
}
Ok(results)
} else {
let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
@ -338,60 +340,41 @@ impl<'a> SearchForFacetValues<'a> {
};
let mut stream = fst.search(automaton).into_stream();
let mut length = 0;
let mut results = vec![];
while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!(
"the facet value is missing from the facet database: {key:?}"
);
continue;
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, value, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
if self
.fetch_original_facets_using_normalized(
fid,
value,
query,
&search_candidates,
&mut results,
)?
.is_break()
{
break;
}
}
}
Ok(results)
Ok(results)
}
} else {
let automaton = Str::new(query).starts_with();
let mut stream = fst.search(automaton).into_stream();
let mut results = vec![];
let mut length = 0;
while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!(
"the facet value is missing from the facet database: {key:?}"
);
continue;
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
let value = self
.one_original_value_of(fid, value, docids.min().unwrap())?
.unwrap_or_else(|| query.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
if self
.fetch_original_facets_using_normalized(
fid,
value,
query,
&search_candidates,
&mut results,
)?
.is_break()
{
break;
}
}
@ -401,7 +384,6 @@ impl<'a> SearchForFacetValues<'a> {
}
None => {
let mut results = vec![];
let mut length = 0;
let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
@ -412,9 +394,8 @@ impl<'a> SearchForFacetValues<'a> {
.one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
.unwrap_or_else(|| left_bound.to_string());
results.push(FacetValueHit { value, count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
if results.len() >= MAX_NUMBER_OF_FACETS {
break;
}
}
@ -422,6 +403,50 @@ impl<'a> SearchForFacetValues<'a> {
}
}
}
/// Appends to `results` the facet hits associated with one normalized facet value.
///
/// `value` is looked up in the `facet_id_normalized_string_strings` database to get
/// the set of strings it maps to. For each of those strings, the docids stored in
/// `facet_id_string_docids` are intersected with `search_candidates`, and a
/// [`FacetValueHit`] is pushed when the intersection is not empty.
///
/// `query` is only used as the hit value when `one_original_value_of` finds nothing.
///
/// Entries missing from either database are logged and skipped.
///
/// Returns `ControlFlow::Break(())` as soon as `results` holds at least
/// `MAX_NUMBER_OF_FACETS` hits, and `ControlFlow::Continue(())` otherwise.
fn fetch_original_facets_using_normalized(
    &self,
    fid: FieldId,
    value: &str,
    query: &str,
    search_candidates: &RoaringBitmap,
    results: &mut Vec<FacetValueHit>,
) -> Result<ControlFlow<()>> {
    let index = self.search_query.index;
    let rtxn = self.search_query.rtxn;

    let database = index.facet_id_normalized_string_strings;
    let key = (fid, value);
    let original_strings = match database.get(rtxn, &key)? {
        Some(original_strings) => original_strings,
        None => {
            error!("the facet value is missing from the facet database: {key:?}");
            return Ok(ControlFlow::Continue(()));
        }
    };

    for original in original_strings {
        let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
        let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
            Some(FacetGroupValue { bitmap, .. }) => bitmap,
            None => {
                error!("the facet value is missing from the facet database: {key:?}");
                // Only this string is unusable: keep going with the other strings
                // of the set instead of dropping all of them.
                continue;
            }
        };

        let count = search_candidates.intersection_len(&docids);
        if count != 0 {
            // A non-zero intersection means `docids` is not empty, so `min()` is `Some`.
            let value = self
                .one_original_value_of(fid, &original, docids.min().unwrap())?
                .unwrap_or_else(|| query.to_string());
            results.push(FacetValueHit { value, count });
        }

        if results.len() >= MAX_NUMBER_OF_FACETS {
            return Ok(ControlFlow::Break(()));
        }
    }

    Ok(ControlFlow::Continue(()))
}
}
#[derive(Debug, Clone, serde::Serialize, PartialEq)]

View file

@ -100,7 +100,7 @@ fn facet_number_values<'a>(
}
/// Return an iterator over each string value in the given field of the given document.
fn facet_string_values<'a>(
pub fn facet_string_values<'a>(
docid: u32,
field_id: u16,
index: &Index,

View file

@ -6,6 +6,7 @@ use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;
use super::facet_string_values;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
use crate::score_details::{self, ScoreDetails};
@ -157,23 +158,7 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
let mut documents = self
.geo_candidates
.iter()
.map(|id| -> Result<_> {
Ok((
id,
[
facet_number_values(id, lat, ctx.index, ctx.txn)?
.next()
.expect("A geo faceted document doesn't contain any lat")?
.0
.2,
facet_number_values(id, lng, ctx.index, ctx.txn)?
.next()
.expect("A geo faceted document doesn't contain any lng")?
.0
.2,
],
))
})
.map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) })
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
// computing the distance between two points is expensive thus we cache the result
documents
@ -185,6 +170,37 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
}
}
/// Reads the latitude and longitude of a single document, returned as `[lat, lng]`.
///
/// Each coordinate is first searched in the facet number index. When nothing is
/// found there, the first value of the facet string index is parsed as an `f64`
/// instead (as the geo extraction behaves).
///
/// # Panics
///
/// Panics when a field has no value in either index for this document, or when
/// the facet string cannot be parsed as an `f64`.
fn geo_value(
    docid: u32,
    field_lat: u16,
    field_lng: u16,
    index: &Index,
    rtxn: &RoTxn,
) -> Result<[f64; 2]> {
    let extract_geo = |geo_field: u16| -> Result<f64> {
        // Preferred source: the facet number index.
        if let Some(entry) = facet_number_values(docid, geo_field, index, rtxn)?.next() {
            let ((_, _, geo), ()) = entry?;
            return Ok(geo);
        }

        // Fallback: the facet string index, parsed as a float.
        match facet_string_values(docid, geo_field, index, rtxn)?.next() {
            Some(entry) => {
                let (_, geo) = entry?;
                Ok(geo.parse::<f64>().expect("cannot parse geo field as f64"))
            }
            None => panic!("A geo faceted document doesn't contain any lat or lng"),
        }
    };

    Ok([extract_geo(field_lat)?, extract_geo(field_lng)?])
}
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
fn id(&self) -> String {
"geo_sort".to_owned()

View file

@ -28,7 +28,7 @@ use db_cache::DatabaseCache;
use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn;
use hnsw::Searcher;
use instant_distance::Search;
use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
@ -40,18 +40,18 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;
use space::Neighbor;
use self::distinct::facet_string_values;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use crate::distance::NDotProductPoint;
use crate::error::FieldIdMapMissingEntry;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::{
normalize_vector, AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy,
UserError, BEU32,
AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
};
/// A structure used throughout the execution of a search query.
@ -85,7 +85,12 @@ impl<'ctx> SearchContext<'ctx> {
let searchable_names = self.index.searchable_fields(self.txn)?;
let mut restricted_fids = Vec::new();
let mut contains_wildcard = false;
for field_name in searchable_attributes {
if field_name == "*" {
contains_wildcard = true;
continue;
}
let searchable_contains_name =
searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name));
let fid = match (fids_map.id(field_name), searchable_contains_name) {
@ -99,8 +104,10 @@ impl<'ctx> SearchContext<'ctx> {
}
.into())
}
// The field is not searchable, but the searchableAttributes are set to * => ignore field
(None, None) => continue,
// The field is not searchable => User error
_otherwise => {
(_fid, Some(false)) => {
let mut valid_fields: BTreeSet<_> =
fids_map.names().map(String::from).collect();
@ -132,7 +139,7 @@ impl<'ctx> SearchContext<'ctx> {
restricted_fids.push(fid);
}
self.restricted_fids = Some(restricted_fids);
self.restricted_fids = (!contains_wildcard).then_some(restricted_fids);
Ok(())
}
@ -437,29 +444,31 @@ pub fn execute_search(
check_sort_criteria(ctx, sort_criteria.as_ref())?;
if let Some(vector) = vector {
let mut searcher = Searcher::new();
let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
let ef = hnsw.len().min(100);
let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
let vector = normalize_vector(vector.clone());
let neighbors = hnsw.nearest(&vector, ef, &mut searcher, &mut dest[..]);
let mut search = Search::default();
let docids = match ctx.index.vector_hnsw(ctx.txn)? {
Some(hnsw) => {
let vector = NDotProductPoint::new(vector.clone());
let neighbors = hnsw.search(&vector, &mut search);
let mut docids = Vec::new();
let mut uniq_docids = RoaringBitmap::new();
for Neighbor { index, distance: _ } in neighbors.iter() {
let index = BEU32::new(*index as u32);
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
if universe.contains(docid) && uniq_docids.insert(docid) {
docids.push(docid);
if docids.len() == (from + length) {
break;
let mut docids = Vec::new();
let mut uniq_docids = RoaringBitmap::new();
for instant_distance::Item { distance: _, pid, point: _ } in neighbors {
let index = BEU32::new(pid.into_inner());
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
if universe.contains(docid) && uniq_docids.insert(docid) {
docids.push(docid);
if docids.len() == (from + length) {
break;
}
}
}
}
}
// return the nearest documents that are also part of the candidates
// along with a dummy list of scores that are useless in this context.
let docids: Vec<_> = docids.into_iter().skip(from).take(length).collect();
// return the nearest documents that are also part of the candidates
// along with a dummy list of scores that are useless in this context.
docids.into_iter().skip(from).take(length).collect()
}
None => Vec::new(),
};
return Ok(PartialSearchResult {
candidates: universe,

View file

@ -1,7 +1,7 @@
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{Rank, ScoreDetails};
use crate::score_details::{self, Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
@ -87,6 +87,6 @@ impl RankingRuleGraphTrait for ExactnessGraph {
}
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::Exactness(rank)
ScoreDetails::ExactWords(score_details::ExactWords::from_rank(rank))
}
}

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 7,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 6,
max_matching_words: 9,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 9,
max_rank: 9,
ExactWords(
ExactWords {
matching_words: 8,
max_matching_words: 8,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 9,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 8,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -135,10 +135,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 7,
},
),
],
@ -155,10 +155,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 7,
},
),
],
@ -175,10 +175,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 5,
},
),
],
@ -195,10 +195,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 5,
},
),
],
@ -215,10 +215,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
],
@ -235,10 +235,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 4,
},
),
],
@ -255,10 +255,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],
@ -275,10 +275,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 3,
},
),
],
@ -295,10 +295,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -315,10 +315,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 2,
},
),
],
@ -335,10 +335,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -355,10 +355,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 7,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 6,
max_matching_words: 7,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 7,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 6,
max_matching_words: 7,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 7,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 6,
max_matching_words: 7,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 4,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 1,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 0,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 7,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 6,
max_matching_words: 7,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 5,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 6,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 5,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 5,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Typo(
@ -41,10 +41,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 4,
},
),
Typo(
@ -67,10 +67,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 4,
},
),
Typo(
@ -93,10 +93,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 4,
},
),
Typo(
@ -119,10 +119,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 4,
},
),
Typo(

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 9,
max_rank: 9,
ExactWords(
ExactWords {
matching_words: 8,
max_matching_words: 8,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 5,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
],
@ -135,10 +135,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],
@ -155,10 +155,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -175,10 +175,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -135,10 +135,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -135,10 +135,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
@ -41,10 +41,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
@ -67,10 +67,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
@ -41,10 +41,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
@ -67,10 +67,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
@ -93,10 +93,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Proximity(
@ -119,10 +119,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Proximity(
@ -145,10 +145,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Proximity(
@ -171,10 +171,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Proximity(
@ -197,10 +197,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Proximity(
@ -223,10 +223,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
Proximity(

View file

@ -21,10 +21,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
],
@ -47,10 +47,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 4,
},
),
],
@ -73,10 +73,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 4,
},
),
],
@ -99,10 +99,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 4,
},
),
],

View file

@ -15,10 +15,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 10,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 9,
max_matching_words: 9,
},
),
],
@ -35,10 +35,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 7,
max_rank: 10,
ExactWords(
ExactWords {
matching_words: 6,
max_matching_words: 9,
},
),
],
@ -55,10 +55,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 9,
max_rank: 9,
ExactWords(
ExactWords {
matching_words: 8,
max_matching_words: 8,
},
),
],
@ -75,10 +75,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 9,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 8,
},
),
],
@ -95,10 +95,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -115,10 +115,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 8,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 7,
max_matching_words: 7,
},
),
],
@ -135,10 +135,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 7,
},
),
],
@ -155,10 +155,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 8,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 7,
},
),
],
@ -175,10 +175,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 6,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 5,
max_matching_words: 5,
},
),
],
@ -195,10 +195,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 6,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 5,
},
),
],
@ -215,10 +215,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 5,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 4,
max_matching_words: 4,
},
),
],
@ -235,10 +235,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 5,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 4,
},
),
],
@ -255,10 +255,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],
@ -275,10 +275,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 3,
},
),
],
@ -295,10 +295,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -315,10 +315,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 2,
},
),
],
@ -335,10 +335,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -355,10 +355,10 @@ expression: "format!(\"{document_ids_scores:#?}\")"
ExactAttribute(
ExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],

View file

@ -37,10 +37,10 @@ expression: "format!(\"{document_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],
@ -78,10 +78,10 @@ expression: "format!(\"{document_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],
@ -119,10 +119,10 @@ expression: "format!(\"{document_scores:#?}\")"
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],

View file

@ -120,10 +120,10 @@ fn test_ignore_stop_words() {
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -173,10 +173,10 @@ fn test_ignore_stop_words() {
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -226,10 +226,10 @@ fn test_ignore_stop_words() {
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -278,10 +278,10 @@ fn test_ignore_stop_words() {
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 3,
max_rank: 3,
ExactWords(
ExactWords {
matching_words: 2,
max_matching_words: 2,
},
),
],
@ -337,10 +337,10 @@ fn test_stop_words_in_phrase() {
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -378,10 +378,10 @@ fn test_stop_words_in_phrase() {
ExactAttribute(
MatchesStart,
),
Exactness(
Rank {
rank: 2,
max_rank: 2,
ExactWords(
ExactWords {
matching_words: 1,
max_matching_words: 1,
},
),
],
@ -430,10 +430,10 @@ fn test_stop_words_in_phrase() {
ExactAttribute(
NoExactMatch,
),
Exactness(
Rank {
rank: 4,
max_rank: 4,
ExactWords(
ExactWords {
matching_words: 3,
max_matching_words: 3,
},
),
],

View file

@ -36,6 +36,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids,
facet_id_f64_docids,
facet_id_string_docids,
facet_id_normalized_string_strings,
facet_id_string_fst,
facet_id_exists_docids,
facet_id_is_null_docids,
@ -94,6 +95,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_prefix_fid_docids.clear(self.wtxn)?;
script_language_docids.clear(self.wtxn)?;
facet_id_f64_docids.clear(self.wtxn)?;
facet_id_normalized_string_strings.clear(self.wtxn)?;
facet_id_string_fst.clear(self.wtxn)?;
facet_id_exists_docids.clear(self.wtxn)?;
facet_id_is_null_docids.clear(self.wtxn)?;

View file

@ -4,10 +4,9 @@ use std::collections::{BTreeSet, HashMap, HashSet};
use fst::IntoStreamer;
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
use heed::{BytesDecode, BytesEncode, Database, RwIter};
use hnsw::Searcher;
use instant_distance::PointId;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use space::KnnPoints;
use time::OffsetDateTime;
use super::facet::delete::FacetsDelete;
@ -239,6 +238,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_fid_docids,
facet_id_f64_docids: _,
facet_id_string_docids: _,
facet_id_normalized_string_strings: _,
facet_id_string_fst: _,
field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
@ -438,24 +438,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// An ugly and slow way to remove the vectors from the HNSW
// It basically reconstructs the HNSW from scratch without editing the current one.
let current_hnsw = self.index.vector_hnsw(self.wtxn)?.unwrap_or_default();
if !current_hnsw.is_empty() {
let mut new_hnsw = Hnsw::default();
let mut searcher = Searcher::new();
let mut new_vector_id_docids = Vec::new();
if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? {
let mut points = Vec::new();
let mut docids = Vec::new();
for result in vector_id_docid.iter(self.wtxn)? {
let (vector_id, docid) = result?;
if !self.to_delete_docids.contains(docid.get()) {
let vector = current_hnsw.get_point(vector_id.get() as usize).clone();
let vector_id = new_hnsw.insert(vector, &mut searcher);
new_vector_id_docids.push((vector_id as u32, docid));
let pid = PointId::from(vector_id.get());
let vector = current_hnsw[pid].clone();
points.push(vector);
docids.push(docid);
}
}
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
vector_id_docid.clear(self.wtxn)?;
for (vector_id, docid) in new_vector_id_docids {
vector_id_docid.put(self.wtxn, &BEU32::new(vector_id), &docid)?;
for (pid, docid) in pids.into_iter().zip(docids) {
vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?;
}
self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
}

View file

@ -76,9 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
pub const FACET_GROUP_SIZE: u8 = 4;
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File;
use std::iter::FromIterator;
use heed::types::DecodeIgnore;
use charabia::normalizer::{Normalize, NormalizerOption};
use grenad::{CompressionType, SortAlgorithm};
use heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
use heed::BytesEncode;
use log::debug;
use time::OffsetDateTime;
@ -87,7 +92,9 @@ use super::FacetsUpdateBulk;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result, BEU16};
use crate::update::index_documents::create_sorter;
use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, BEU16};
pub mod bulk;
pub mod delete;
@ -159,26 +166,69 @@ impl<'i> FacetsUpdate<'i> {
incremental_update.execute(wtxn)?;
}
// We clear the list of normalized-for-search facets
// and the previous FSTs to compute everything from scratch
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
self.index.facet_id_string_fst.clear(wtxn)?;
// As we can't use the same write transaction to read and write in two different databases
// we must create a temporary sorter that we will write into LMDB afterward.
// As multiple unnormalized facet values can become the same normalized facet value
// we must merge them together.
let mut sorter = create_sorter(
SortAlgorithm::Unstable,
merge_btreeset_string,
CompressionType::None,
None,
None,
None,
);
// We iterate on the list of original, semi-normalized, facet values
// and normalize them for search, inserting them in LMDB in any given order.
let options = NormalizerOption { lossy: true, ..Default::default() };
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, ()) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
let normalized_facet = left_bound.normalize(&options);
let set = BTreeSet::from_iter(std::iter::once(left_bound));
let key = (field_id, normalized_facet.as_ref());
let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?;
sorter.insert(key, val)?;
}
}
// In this loop we don't need to take care of merging the sets of original
// facet strings as the grenad sorter already merged them for us.
let mut merger_iter = sorter.into_stream_merger_iter()?;
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
self.index
.facet_id_normalized_string_strings
.remap_types::<ByteSlice, ByteSlice>()
.put(wtxn, key_bytes, btreeset_bytes)?;
}
// We compute one FST by string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
let database =
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, _) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(left_bound)?;
let ((field_id, normalized_facet), _) = result?;
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(normalized_facet)?;
}
}
@ -187,9 +237,6 @@ impl<'i> FacetsUpdate<'i> {
text_fsts.push((field_id, fst));
}
// We remove all of the previous FSTs that were in this database
self.index.facet_id_string_fst.clear(wtxn)?;
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;

View file

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::io;
use std::result::Result as StdResult;
@ -44,6 +45,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
}
}
/// Merge function that unions several JSON-serialized `BTreeSet<String>` values
/// into a single JSON-serialized `BTreeSet<String>`.
///
/// When exactly one value is given it is returned as-is, without being decoded.
///
/// # Panics
///
/// Panics if `values` is empty, if any value is not a valid JSON-encoded set of
/// strings, or if the merged set cannot be serialized back to JSON.
pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    // Fast path: nothing to merge, hand back the only entry untouched.
    if let [single] = values {
        return Ok(single.clone());
    }

    // TODO improve the perf by using a `#[borrow] Cow<str>`.
    let mut merged: Option<BTreeSet<String>> = None;
    for bytes in values {
        let decoded = serde_json::from_slice::<BTreeSet<String>>(bytes.as_ref()).unwrap();
        match merged.as_mut() {
            // Fold every subsequent set into the first decoded one.
            Some(accumulator) => accumulator.extend(decoded),
            None => merged = Some(decoded),
        }
    }

    // `merged` is only `None` when `values` is empty.
    let strings = merged.unwrap();
    Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
}
/// Merge function that ignores every value except the first one, which is
/// returned unchanged.
///
/// # Panics
///
/// Panics if `values` is empty.
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    let first = &values[0];
    Ok(Cow::clone(first))
}

View file

@ -13,9 +13,9 @@ pub use grenad_helpers::{
GrenadParameters, MergeableReader,
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
MergeFn,
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
serialize_roaring_bitmap, MergeFn,
};
use crate::MAX_WORD_LENGTH;

View file

@ -26,7 +26,7 @@ pub use self::enrich::{
};
pub use self::helpers::{
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};

View file

@ -9,22 +9,19 @@ use charabia::{Language, Script};
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::RwTxn;
use hnsw::Searcher;
use roaring::RoaringBitmap;
use space::KnnPoints;
use super::helpers::{
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::distance::NDotProductPoint;
use crate::error::UserError;
use crate::facet::FacetType;
use crate::index::Hnsw;
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
use crate::{
lat_lng_to_xyz, normalize_vector, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result,
BEU32,
};
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@ -292,17 +289,20 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints(vector_points) => {
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
let mut expected_dimensions = match index.vector_id_docid.iter(wtxn)?.next() {
Some(result) => {
let (vector_id, _) = result?;
Some(hnsw.get_point(vector_id.get() as usize).len())
}
None => None,
let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? {
Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(),
None => Default::default(),
};
// Convert the PointIds into DocumentIds
let mut docids = Vec::new();
for pid in pids {
let docid =
index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap();
docids.push(docid.get());
}
let mut expected_dimensions = points.get(0).map(|p| p.len());
let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
@ -318,12 +318,26 @@ pub(crate) fn write_typed_chunk_into_index(
return Err(UserError::InvalidVectorDimensions { expected, found })?;
}
let vector = normalize_vector(vector);
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
points.push(NDotProductPoint::new(vector));
docids.push(docid);
}
log::debug!("There are {} entries in the HNSW so far", hnsw.len());
index.put_vector_hnsw(wtxn, &hnsw)?;
assert_eq!(docids.len(), points.len());
let hnsw_length = points.len();
let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
index.vector_id_docid.clear(wtxn)?;
for (docid, pid) in docids.into_iter().zip(pids) {
index.vector_id_docid.put(
wtxn,
&BEU32::new(pid.into_inner()),
&BEU32::new(docid),
)?;
}
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
index.put_vector_hnsw(wtxn, &new_hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();

View file

@ -4,8 +4,9 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::{
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
MergeFn,
};
pub use self::indexer_config::IndexerConfig;
pub use self::prefix_word_pairs::{

View file

@ -466,13 +466,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let current = self.index.stop_words(self.wtxn)?;
// Apply an unlossy normalization on stop_words
let stop_words = stop_words
let stop_words: BTreeSet<String> = stop_words
.iter()
.map(|w| w.as_str().normalize(&Default::default()).into_owned());
.map(|w| w.as_str().normalize(&Default::default()).into_owned())
.collect();
// since we can't compare a BTreeSet with an FST we are going to convert the
// BTreeSet to an FST and then compare bytes per bytes the two FSTs.
let fst = fst::Set::from_iter(stop_words)?;
let fst = fst::Set::from_iter(stop_words.into_iter())?;
// Does the new FST differ from the previous one?
if current