diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 3880fac4b..09c8b7b8c 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -226,6 +226,7 @@ InvalidIndexLimit , InvalidRequest , BAD_REQUEST ; InvalidIndexOffset , InvalidRequest , BAD_REQUEST ; InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ; InvalidIndexUid , InvalidRequest , BAD_REQUEST ; +InvalidAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; @@ -336,6 +337,9 @@ impl ErrorCode for milli::Error { UserError::SortRankingRuleMissing => Code::InvalidSearchSort, UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets, UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort, + UserError::InvalidSearchableAttribute { .. } => { + Code::InvalidAttributesToSearchOn + } UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 0c45f08c7..a79f95ee4 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -72,6 +72,8 @@ pub struct SearchQueryGet { crop_marker: String, #[deserr(default, error = DeserrQueryParamError)] matching_strategy: MatchingStrategy, + #[deserr(default, error = DeserrQueryParamError)] + pub attributes_to_search_on: Option>, } impl From for SearchQuery { @@ -105,6 +107,7 @@ impl From for SearchQuery { highlight_post_tag: other.highlight_post_tag, crop_marker: other.crop_marker, matching_strategy: other.matching_strategy, + attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), } } } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index ec7e79692..87cfdadb3 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -79,6 +79,8 @@ pub struct SearchQuery { pub crop_marker: String, #[deserr(default, error = DeserrJsonError, default)] pub matching_strategy: MatchingStrategy, + #[deserr(default, error = DeserrJsonError, default)] + pub attributes_to_search_on: Option>, } impl SearchQuery { @@ -136,6 +138,8 @@ pub struct SearchQueryWithIndex { pub crop_marker: String, #[deserr(default, error = DeserrJsonError, default)] pub matching_strategy: MatchingStrategy, + #[deserr(default, error = DeserrJsonError, default)] + pub attributes_to_search_on: Option>, } impl SearchQueryWithIndex { @@ -162,6 +166,7 @@ impl SearchQueryWithIndex { highlight_post_tag, crop_marker, matching_strategy, + attributes_to_search_on, } = self; ( index_uid, @@ -186,6 +191,7 @@ impl SearchQueryWithIndex { highlight_post_tag, crop_marker, matching_strategy, + attributes_to_search_on, // do not use ..Default::default() here, // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` }, @@ -314,6 +320,10 @@ pub fn perform_search( search.query(query); } + if let Some(ref searchable) = query.attributes_to_search_on { + search.searchable_attributes(searchable); + } + let is_finite_pagination = query.is_finite_pagination(); search.terms_matching_strategy(query.matching_strategy.into()); diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index beab9970b..517024c74 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -346,17 +346,24 @@ impl Index<'_> { query: Value, test: impl Fn(Value, StatusCode) + UnwindSafe + Clone, ) { - let (response, code) = self.search_post(query.clone()).await; - let t = test.clone(); - if let Err(e) = catch_unwind(move || t(response, code)) { - eprintln!("Error with post search"); - resume_unwind(e); - } + let post = self.search_post(query.clone()).await; + let query = yaup::to_string(&query).unwrap(); - let (response, code) = self.search_get(&query).await; - if let Err(e) = catch_unwind(move || test(response, code)) { - eprintln!("Error with get search"); - resume_unwind(e); + let get = self.search_get(&query).await; + + insta::allow_duplicates! { + let (response, code) = post; + let t = test.clone(); + if let Err(e) = catch_unwind(move || t(response, code)) { + eprintln!("Error with post search"); + resume_unwind(e); + } + + let (response, code) = get; + if let Err(e) = catch_unwind(move || test(response, code)) { + eprintln!("Error with get search"); + resume_unwind(e); + } } } diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index f314e8800..273472d46 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -963,3 +963,29 @@ async fn sort_unset_ranking_rule() { ) .await; } + +#[actix_rt::test] +async fn search_on_unknown_field() { + let server = Server::new().await; + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index + .search( + json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", + "code": "invalid_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_attributes_to_search_on" + } + "###); + }, + ) + .await; +} diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 045cfde2c..6a55c7569 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -5,6 +5,7 @@ mod errors; mod formatted; mod multi; mod pagination; +mod restrict_searchable; use once_cell::sync::Lazy; use serde_json::{json, Value}; diff --git a/meilisearch/tests/search/restrict_searchable.rs b/meilisearch/tests/search/restrict_searchable.rs new file mode 100644 index 000000000..f119acea5 --- /dev/null +++ b/meilisearch/tests/search/restrict_searchable.rs @@ -0,0 +1,267 @@ +use meili_snap::{json_string, snapshot}; +use once_cell::sync::Lazy; +use serde_json::{json, Value}; + +use crate::common::index::Index; +use crate::common::Server; + +async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> { + let index = server.index("test"); + + index.add_documents(documents.clone(), None).await; + index.wait_task(0).await; + index +} + +static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + }, + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + }]) +}); + +#[actix_rt::test] +async fn simple_search_on_title() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + // simple search should return 2 documents (ids: 2 and 3). + index + .search( + json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"].as_array().unwrap().len(), @"2"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn simple_prefix_search_on_title() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + // simple search should return 2 documents (ids: 2 and 3). + index + .search(json!({"q": "Captain Mar", "attributesToSearchOn": ["title"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"].as_array().unwrap().len(), @"2"); + }) + .await; +} + +#[actix_rt::test] +async fn simple_search_on_title_matching_strategy_all() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + // simple search matching strategy all should only return 1 document (ids: 2). + index + .search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "matchingStrategy": "all"}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"].as_array().unwrap().len(), @"1"); + }) + .await; +} + +#[actix_rt::test] +async fn simple_search_on_no_field() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + // simple search on no field shouldn't return any document. + index + .search(json!({"q": "Captain Marvel", "attributesToSearchOn": []}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"].as_array().unwrap().len(), @"0"); + }) + .await; +} + +#[actix_rt::test] +async fn word_ranking_rule_order() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + // Document 3 should appear before document 2. + index + .search( + json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["id"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), + @r###" + [ + { + "id": "3" + }, + { + "id": "2" + } + ] + "### + ); + }, + ) + .await; +} + +#[actix_rt::test] +async fn word_ranking_rule_order_exact_words() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + index.update_settings_typo_tolerance(json!({"disableOnWords": ["Captain", "Marvel"]})).await; + index.wait_task(1).await; + + // simple search should return 2 documents (ids: 2 and 3). + index + .search( + json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["id"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), + @r###" + [ + { + "id": "3" + }, + { + "id": "2" + } + ] + "### + ); + }, + ) + .await; +} + +#[actix_rt::test] +async fn typo_ranking_rule_order() { + let server = Server::new().await; + let index = index_with_documents( + &server, + &json!([ + { + "title": "Capitain Marivel", + "desc": "Captain Marvel", + "id": "1", + }, + { + "title": "Captain Marivel", + "desc": "a Shazam ersatz", + "id": "2", + }]), + ) + .await; + + // Document 2 should appear before document 1. + index + .search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), + @r###" + [ + { + "id": "2" + }, + { + "id": "1" + } + ] + "### + ); + }) + .await; +} + +#[actix_rt::test] +async fn attributes_ranking_rule_order() { + let server = Server::new().await; + let index = index_with_documents( + &server, + &json!([ + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "footer": "The story of Captain Marvel", + "id": "1", + }, + { + "title": "The Avengers", + "desc": "Captain Marvel is far from the earth", + "footer": "A super hero team", + "id": "2", + }]), + ) + .await; + + // Document 2 should appear before document 1. + index + .search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["desc", "footer"], "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), + @r###" + [ + { + "id": "2" + }, + { + "id": "1" + } + ] + "### + ); + }) + .await; +} + +#[actix_rt::test] +async fn exactness_ranking_rule_order() { + let server = Server::new().await; + let index = index_with_documents( + &server, + &json!([ + { + "title": "Captain Marvel", + "desc": "Captain Marivel", + "id": "1", + }, + { + "title": "Captain Marvel", + "desc": "CaptainMarvel", + "id": "2", + }]), + ) + .await; + + // Document 2 should appear before document 1. + index + .search(json!({"q": "Captain Marvel", "attributesToRetrieve": ["id"], "attributesToSearchOn": ["desc"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), + @r###" + [ + { + "id": "2" + }, + { + "id": "1" + } + ] + "### + ); + }) + .await; +} diff --git a/milli/src/error.rs b/milli/src/error.rs index 3df599b61..61597faf3 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -128,6 +128,16 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco } )] InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, + #[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.", + .field, + .valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "), + .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), + )] + InvalidSearchableAttribute { + field: String, + valid_fields: BTreeSet, + hidden_fields: bool, + }, #[error("{}", HeedError::BadOpenOptions)] InvalidLmdbOpenOptions, #[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")] diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index de2644e11..c54168a36 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -23,3 +23,9 @@ pub use self::roaring_bitmap_length::{ pub use self::script_language_codec::ScriptLanguageCodec; pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec}; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; + +pub trait BytesDecodeOwned { + type DItem; + + fn bytes_decode_owned(bytes: &[u8]) -> Option; +} diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs index 994e23b39..9ad2e9707 100644 --- a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -2,8 +2,11 @@ use std::borrow::Cow; use std::convert::TryInto; use std::mem::size_of; +use heed::BytesDecode; use roaring::RoaringBitmap; +use crate::heed_codec::BytesDecodeOwned; + pub struct BoRoaringBitmapCodec; impl BoRoaringBitmapCodec { @@ -13,7 +16,7 @@ impl BoRoaringBitmapCodec { } } -impl heed::BytesDecode<'_> for BoRoaringBitmapCodec { +impl BytesDecode<'_> for BoRoaringBitmapCodec { type DItem = RoaringBitmap; fn bytes_decode(bytes: &[u8]) -> Option { @@ -28,6 +31,14 @@ impl heed::BytesDecode<'_> for BoRoaringBitmapCodec { } } +impl BytesDecodeOwned for BoRoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode_owned(bytes: &[u8]) -> Option { + Self::bytes_decode(bytes) + } +} + impl heed::BytesEncode<'_> for BoRoaringBitmapCodec { type EItem = RoaringBitmap; diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 01ce523ba..bf76287d8 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -5,6 +5,8 @@ use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; +use crate::heed_codec::BytesDecodeOwned; + /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able /// to determine the encoding used only by using the array of bytes length. @@ -103,6 +105,14 @@ impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { } } +impl BytesDecodeOwned for CboRoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode_owned(bytes: &[u8]) -> Option { + Self::deserialize_from(bytes).ok() + } +} + impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { type EItem = RoaringBitmap; diff --git a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs index 6cec0eb44..f982cc105 100644 --- a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs @@ -2,6 +2,8 @@ use std::borrow::Cow; use roaring::RoaringBitmap; +use crate::heed_codec::BytesDecodeOwned; + pub struct RoaringBitmapCodec; impl heed::BytesDecode<'_> for RoaringBitmapCodec { @@ -12,6 +14,14 @@ impl heed::BytesDecode<'_> for RoaringBitmapCodec { } } +impl BytesDecodeOwned for RoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode_owned(bytes: &[u8]) -> Option { + RoaringBitmap::deserialize_from(bytes).ok() + } +} + impl heed::BytesEncode<'_> for RoaringBitmapCodec { type EItem = RoaringBitmap; diff --git a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs index e749680a0..8fae60df7 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs @@ -1,11 +1,23 @@ use std::mem; +use heed::BytesDecode; + +use crate::heed_codec::BytesDecodeOwned; + pub struct BoRoaringBitmapLenCodec; -impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec { +impl BytesDecode<'_> for BoRoaringBitmapLenCodec { type DItem = u64; fn bytes_decode(bytes: &[u8]) -> Option { Some((bytes.len() / mem::size_of::()) as u64) } } + +impl BytesDecodeOwned for BoRoaringBitmapLenCodec { + type DItem = u64; + + fn bytes_decode_owned(bytes: &[u8]) -> Option { + Self::bytes_decode(bytes) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs index 4f728f1cd..5719a538a 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs @@ -1,11 +1,14 @@ use std::mem; +use heed::BytesDecode; + use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec}; use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD; +use crate::heed_codec::BytesDecodeOwned; pub struct CboRoaringBitmapLenCodec; -impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec { +impl BytesDecode<'_> for CboRoaringBitmapLenCodec { type DItem = u64; fn bytes_decode(bytes: &[u8]) -> Option { @@ -20,3 +23,11 @@ impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec { } } } + +impl BytesDecodeOwned for CboRoaringBitmapLenCodec { + type DItem = u64; + + fn bytes_decode_owned(bytes: &[u8]) -> Option { + Self::bytes_decode(bytes) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs index 4d266e413..a9b0506ff 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs @@ -3,6 +3,8 @@ use std::mem; use byteorder::{LittleEndian, ReadBytesExt}; +use crate::heed_codec::BytesDecodeOwned; + const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE: u16 = 12347; @@ -59,6 +61,14 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { } } +impl BytesDecodeOwned for RoaringBitmapLenCodec { + type DItem = u64; + + fn bytes_decode_owned(bytes: &[u8]) -> Option { + RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok() + } +} + #[cfg(test)] mod tests { use heed::BytesEncode; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 970c0b7ab..dc25c0f23 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -29,6 +29,7 @@ pub struct Search<'a> { offset: usize, limit: usize, sort_criteria: Option>, + searchable_attributes: Option<&'a [String]>, geo_strategy: new::GeoSortStrategy, terms_matching_strategy: TermsMatchingStrategy, scoring_strategy: ScoringStrategy, @@ -47,6 +48,7 @@ impl<'a> Search<'a> { offset: 0, limit: 20, sort_criteria: None, + searchable_attributes: None, geo_strategy: new::GeoSortStrategy::default(), terms_matching_strategy: TermsMatchingStrategy::default(), scoring_strategy: Default::default(), @@ -82,6 +84,11 @@ impl<'a> Search<'a> { self } + pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> { + self.searchable_attributes = Some(searchable); + self + } + pub fn terms_matching_strategy(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> { self.terms_matching_strategy = value; self @@ -117,6 +124,11 @@ impl<'a> Search<'a> { pub fn execute(&self) -> Result { let mut ctx = SearchContext::new(self.index, self.rtxn); + + if let Some(searchable_attributes) = self.searchable_attributes { + ctx.searchable_attributes(searchable_attributes)?; + } + let PartialSearchResult { located_query_terms, candidates, documents_ids, document_scores } = execute_search( &mut ctx, @@ -154,6 +166,7 @@ impl fmt::Debug for Search<'_> { offset, limit, sort_criteria, + searchable_attributes, geo_strategy: _, terms_matching_strategy, scoring_strategy, @@ -169,6 +182,7 @@ impl fmt::Debug for Search<'_> { .field("offset", offset) .field("limit", limit) .field("sort_criteria", sort_criteria) + .field("searchable_attributes", searchable_attributes) .field("terms_matching_strategy", terms_matching_strategy) .field("scoring_strategy", scoring_strategy) .field("exhaustive_number_hits", exhaustive_number_hits) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 90c604d72..e0a2ba3cf 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -4,12 +4,13 @@ use std::hash::Hash; use fxhash::FxHashMap; use heed::types::ByteSlice; -use heed::{BytesDecode, BytesEncode, Database, RoTxn}; +use heed::{BytesEncode, Database, RoTxn}; use roaring::RoaringBitmap; use super::interner::Interned; use super::Word; -use crate::heed_codec::StrBEU16Codec; +use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; +use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, }; @@ -22,50 +23,104 @@ use crate::{ #[derive(Default)] pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option<&'ctx [u8]>>, + FxHashMap<(u8, Interned, Interned), Option>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option<&'ctx [u8]>>, + FxHashMap<(u8, Interned, Interned), Option>>, pub prefix_word_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option<&'ctx [u8]>>, - pub word_docids: FxHashMap, Option<&'ctx [u8]>>, - pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, - pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, - pub exact_word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + FxHashMap<(u8, Interned, Interned), Option>>, + pub word_docids: FxHashMap, Option>>, + pub exact_word_docids: FxHashMap, Option>>, + pub word_prefix_docids: FxHashMap, Option>>, + pub exact_word_prefix_docids: FxHashMap, Option>>, pub words_fst: Option>>, - pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, - pub word_prefix_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_position_docids: FxHashMap<(Interned, u16), Option>>, + pub word_prefix_position_docids: FxHashMap<(Interned, u16), Option>>, pub word_positions: FxHashMap, Vec>, pub word_prefix_positions: FxHashMap, Vec>, - pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, - pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option>>, + pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option>>, pub word_fids: FxHashMap, Vec>, pub word_prefix_fids: FxHashMap, Vec>, } impl<'ctx> DatabaseCache<'ctx> { - fn get_value<'v, K1, KC>( + fn get_value<'v, K1, KC, DC>( txn: &'ctx RoTxn, cache_key: K1, db_key: &'v KC::EItem, - cache: &mut FxHashMap>, + cache: &mut FxHashMap>>, db: Database, - ) -> Result> + ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, + DC: BytesDecodeOwned, { - let bitmap_ptr = match cache.entry(cache_key) { - Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(), - Entry::Vacant(entry) => { - let bitmap_ptr = db.get(txn, db_key)?; - entry.insert(bitmap_ptr); - bitmap_ptr + if let Entry::Vacant(entry) = cache.entry(cache_key) { + let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed); + entry.insert(bitmap_ptr); + } + + match cache.get(&cache_key).unwrap() { + Some(Cow::Borrowed(bytes)) => { + DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) } - }; - Ok(bitmap_ptr) + Some(Cow::Owned(bytes)) => { + DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) + } + None => Ok(None), + } + } + + fn get_value_from_keys<'v, K1, KC, DC>( + txn: &'ctx RoTxn, + cache_key: K1, + db_keys: &'v [KC::EItem], + cache: &mut FxHashMap>>, + db: Database, + merger: MergeFn, + ) -> Result> + where + K1: Copy + Eq + Hash, + KC: BytesEncode<'v>, + DC: BytesDecodeOwned, + KC::EItem: Sized, + { + if let Entry::Vacant(entry) = cache.entry(cache_key) { + let bitmap_ptr: Option> = match db_keys { + [] => None, + [key] => db.get(txn, key)?.map(Cow::Borrowed), + keys => { + let bitmaps = keys + .iter() + .filter_map(|key| db.get(txn, key).transpose()) + .map(|v| v.map(Cow::Borrowed)) + .collect::>, _>>()?; + + if bitmaps.is_empty() { + None + } else { + Some(merger(&[], &bitmaps[..])?) + } + } + }; + + entry.insert(bitmap_ptr); + } + + match cache.get(&cache_key).unwrap() { + Some(Cow::Borrowed(bytes)) => { + DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) + } + Some(Cow::Owned(bytes)) => { + DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) + } + None => Ok(None), + } } } + impl<'ctx> SearchContext<'ctx> { pub fn get_words_fst(&mut self) -> Result>> { if let Some(fst) = self.db_cache.words_fst.clone() { @@ -99,30 +154,41 @@ impl<'ctx> SearchContext<'ctx> { /// Retrieve or insert the given value in the `word_docids` database. fn get_db_word_docids(&mut self, word: Interned) -> Result> { - DatabaseCache::get_value( - self.txn, - word, - self.word_interner.get(word).as_str(), - &mut self.db_cache.word_docids, - self.index.word_docids.remap_data_type::(), - )? - .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(word).as_str(); + let keys: Vec<_> = restricted_fids.iter().map(|fid| (interned, *fid)).collect(); + + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + word, + &keys[..], + &mut self.db_cache.word_docids, + self.index.word_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.word_docids, + self.index.word_docids.remap_data_type::(), + ), + } } fn get_db_exact_word_docids( &mut self, word: Interned, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.exact_word_docids, self.index.exact_word_docids.remap_data_type::(), - )? - .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { @@ -150,30 +216,41 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - DatabaseCache::get_value( - self.txn, - prefix, - self.word_interner.get(prefix).as_str(), - &mut self.db_cache.word_prefix_docids, - self.index.word_prefix_docids.remap_data_type::(), - )? - .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + match &self.restricted_fids { + Some(restricted_fids) => { + let interned = self.word_interner.get(prefix).as_str(); + let keys: Vec<_> = restricted_fids.iter().map(|fid| (interned, *fid)).collect(); + + DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + self.txn, + prefix, + &keys[..], + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + merge_cbo_roaring_bitmaps, + ) + } + None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + self.txn, + prefix, + self.word_interner.get(prefix).as_str(), + &mut self.db_cache.word_prefix_docids, + self.index.word_prefix_docids.remap_data_type::(), + ), + } } fn get_db_exact_word_prefix_docids( &mut self, prefix: Interned, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.exact_word_prefix_docids, self.index.exact_word_prefix_docids.remap_data_type::(), - )? - .map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_pair_proximity_docids( @@ -182,7 +259,7 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (proximity, word1, word2), &( @@ -192,9 +269,7 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_pair_proximity_docids, self.index.word_pair_proximity_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_pair_proximity_docids_len( @@ -203,7 +278,7 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( self.txn, (proximity, word1, word2), &( @@ -213,11 +288,7 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_pair_proximity_docids, self.index.word_pair_proximity_docids.remap_data_type::(), - )? - .map(|bytes| { - CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()) - }) - .transpose() + ) } pub fn get_db_word_prefix_pair_proximity_docids( @@ -226,7 +297,7 @@ impl<'ctx> SearchContext<'ctx> { prefix2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (proximity, word1, prefix2), &( @@ -236,9 +307,7 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.word_prefix_pair_proximity_docids, self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_prefix_word_pair_proximity_docids( &mut self, @@ -246,7 +315,7 @@ impl<'ctx> SearchContext<'ctx> { right: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (proximity, left_prefix, right), &( @@ -256,9 +325,7 @@ impl<'ctx> SearchContext<'ctx> { ), &mut self.db_cache.prefix_word_pair_proximity_docids, self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_fid_docids( @@ -266,15 +333,18 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, fid: u16, ) -> Result> { - DatabaseCache::get_value( + // if the requested fid isn't in the restricted list, return None. + if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + return Ok(None); + } + + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (word, fid), &(self.word_interner.get(word).as_str(), fid), &mut self.db_cache.word_fid_docids, self.index.word_fid_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_prefix_fid_docids( @@ -282,15 +352,18 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, fid: u16, ) -> Result> { - DatabaseCache::get_value( + // if the requested fid isn't in the restricted list, return None. + if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + return Ok(None); + } + + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (word_prefix, fid), &(self.word_interner.get(word_prefix).as_str(), fid), &mut self.db_cache.word_prefix_fid_docids, self.index.word_prefix_fid_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_fids(&mut self, word: Interned) -> Result> { @@ -309,7 +382,7 @@ impl<'ctx> SearchContext<'ctx> { for result in remap_key_type { let ((_, fid), value) = result?; // filling other caches to avoid searching for them again - self.db_cache.word_fid_docids.insert((word, fid), Some(value)); + self.db_cache.word_fid_docids.insert((word, fid), Some(Cow::Borrowed(value))); fids.push(fid); } entry.insert(fids.clone()); @@ -335,7 +408,9 @@ impl<'ctx> SearchContext<'ctx> { for result in remap_key_type { let ((_, fid), value) = result?; // filling other caches to avoid searching for them again - self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value)); + self.db_cache + .word_prefix_fid_docids + .insert((word_prefix, fid), Some(Cow::Borrowed(value))); fids.push(fid); } entry.insert(fids.clone()); @@ -350,15 +425,13 @@ impl<'ctx> SearchContext<'ctx> { word: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (word, position), &(self.word_interner.get(word).as_str(), position), &mut self.db_cache.word_position_docids, self.index.word_position_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_prefix_position_docids( @@ -366,15 +439,13 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (word_prefix, position), &(self.word_interner.get(word_prefix).as_str(), position), &mut self.db_cache.word_prefix_position_docids, self.index.word_prefix_position_docids.remap_data_type::(), - )? - .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) - .transpose() + ) } pub fn get_db_word_positions(&mut self, word: Interned) -> Result> { @@ -393,7 +464,9 @@ impl<'ctx> SearchContext<'ctx> { for result in remap_key_type { let ((_, position), value) = result?; // filling other caches to avoid searching for them again - self.db_cache.word_position_docids.insert((word, position), Some(value)); + self.db_cache + .word_position_docids + .insert((word, position), Some(Cow::Borrowed(value))); positions.push(position); } entry.insert(positions.clone()); @@ -424,7 +497,7 @@ impl<'ctx> SearchContext<'ctx> { // filling other caches to avoid searching for them again self.db_cache .word_prefix_position_docids - .insert((word_prefix, position), Some(value)); + .insert((word_prefix, position), Some(Cow::Borrowed(value))); positions.push(position); } entry.insert(positions.clone()); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 8bdcf077b..034b279ad 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -20,7 +20,7 @@ mod sort; #[cfg(test)] mod tests; -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use bucket_sort::{bucket_sort, BucketSortOutput}; use charabia::TokenizerBuilder; @@ -46,6 +46,7 @@ use self::geo_sort::GeoSort; pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; +use crate::error::FieldIdMapMissingEntry; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::{ @@ -62,6 +63,7 @@ pub struct SearchContext<'ctx> { pub phrase_interner: DedupInterner, pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, + pub restricted_fids: Option>, } impl<'ctx> SearchContext<'ctx> { @@ -74,8 +76,66 @@ impl<'ctx> SearchContext<'ctx> { phrase_interner: <_>::default(), term_interner: <_>::default(), phrase_docids: <_>::default(), + restricted_fids: None, } } + + pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> { + let fids_map = self.index.fields_ids_map(self.txn)?; + let searchable_names = self.index.searchable_fields(self.txn)?; + + let mut restricted_fids = Vec::new(); + for field_name in searchable_attributes { + let searchable_contains_name = + searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name)); + let fid = match (fids_map.id(field_name), searchable_contains_name) { + // The Field id exist and the field is searchable + (Some(fid), Some(true)) | (Some(fid), None) => fid, + // The field is searchable but the Field id doesn't exist => Internal Error + (None, Some(true)) => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: field_name.to_string(), + process: "search", + } + .into()) + } + // The field is not searchable => User error + _otherwise => { + let mut valid_fields: BTreeSet<_> = + fids_map.names().map(String::from).collect(); + + // Filter by the searchable names + if let Some(sn) = searchable_names { + let searchable_names = sn.iter().map(|s| s.to_string()).collect(); + valid_fields = &valid_fields & &searchable_names; + } + + let searchable_count = valid_fields.len(); + + // Remove hidden fields + if let Some(dn) = self.index.displayed_fields(self.txn)? { + let displayable_names = dn.iter().map(|s| s.to_string()).collect(); + valid_fields = &valid_fields & &displayable_names; + } + + let hidden_fields = searchable_count > valid_fields.len(); + let field = field_name.to_string(); + return Err(UserError::InvalidSearchableAttribute { + field, + valid_fields, + hidden_fields, + } + .into()); + } + }; + + restricted_fids.push(fid); + } + + self.restricted_fids = Some(restricted_fids); + + Ok(()) + } } #[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)] diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 7a3fd1fd9..32584825b 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -4,7 +4,8 @@ pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDele pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ - DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId, + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn, }; pub use self::indexer_config::IndexerConfig; pub use self::prefix_word_pairs::{