Fix the indexing of the searchable fields

This commit is contained in:
Tamo 2024-05-07 17:56:40 +02:00
parent 4e4a1ddff7
commit 685f452fb2
12 changed files with 235 additions and 154 deletions

View File

@ -48,7 +48,7 @@ fn main() -> Result<(), Box<dyn Error>> {
let start = Instant::now();
let mut ctx = SearchContext::new(&index, &txn);
let mut ctx = SearchContext::new(&index, &txn)?;
let universe = filtered_universe(&ctx, &None)?;
let docs = execute_search(

View File

@ -25,4 +25,8 @@ impl FieldidsWeightsMap {
pub fn max_weight(&self) -> Option<Weight> {
self.map.values().copied().max()
}
/// Returns an iterator over every field id that has an entry in the map.
///
/// The ids are yielded in whatever order the underlying map iterates its keys;
/// callers that need a stable order must sort the result themselves.
pub fn ids(&self) -> impl Iterator<Item = FieldId> + '_ {
    self.map.keys().copied()
}
}

View File

@ -28,7 +28,7 @@ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap,
GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec,
BEU16, BEU32, BEU64,
Weight, BEU16, BEU32, BEU64,
};
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -443,6 +443,27 @@ impl Index {
.unwrap_or_default())
}
/// Returns every searchable field together with its field id and its weight.
///
/// The entries follow the order in which `searchable_fields` returns the
/// field names.
///
/// # Errors
///
/// Propagates any error raised while reading the fields ids map, the
/// field-id weights map or the searchable fields from `rtxn`.
///
/// # Panics
///
/// Panics if a searchable field has no id in the fields ids map, or if its
/// id has no entry in the weights map. Both are treated as invariants of the
/// index.
pub fn searchable_fields_and_weights<'a>(
    &self,
    rtxn: &'a RoTxn,
) -> heed::Result<Vec<(Cow<'a, str>, FieldId, Weight)>> {
    let fid_map = self.fields_ids_map(rtxn)?;
    let weight_map = self.fieldids_weights_map(rtxn)?;
    let searchable = self.searchable_fields(rtxn)?;

    Ok(searchable
        .into_iter()
        .map(|field| {
            let fid = fid_map
                .id(&field)
                .expect("the searchable attributes are a subset of the field id map");
            let weight =
                weight_map.weight(fid).expect("all the searchable fields have a weight");
            (field, fid, weight)
        })
        .collect())
}
/* geo rtree */
/// Writes the provided `rtree` which associates coordinates to documents ids.
@ -605,9 +626,25 @@ impl Index {
pub(crate) fn put_all_searchable_fields_from_fields_ids_map(
&self,
wtxn: &mut RwTxn,
user_fields: &[&str],
user_fields: Option<&[&str]>,
fields_ids_map: &FieldsIdsMap,
) -> Result<()> {
// Special case if there are no user-defined fields.
// Then the whole field id map is marked as searchable.
if user_fields.is_none() {
let mut weights = self.fieldids_weights_map(&wtxn)?;
let mut searchable = Vec::new();
for (weight, (fid, name)) in fields_ids_map.iter().enumerate() {
searchable.push(name);
weights.insert(fid, weight as u16);
}
self.put_searchable_fields(wtxn, &searchable)?;
self.put_fieldids_weights_map(wtxn, &weights)?;
return Ok(());
}
let user_fields = user_fields.unwrap();
// We can write the user defined searchable fields as-is.
self.put_user_defined_searchable_fields(wtxn, user_fields)?;
@ -617,13 +654,13 @@ impl Index {
// 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion.
// 2. Iterate over the user defined searchable fields.
// 3. If a user defined field is a subset of a field defined in the fields_ids_map
// (ie doggo.name is a subset of doggo) then we push it at the end of the fields.
let mut real_fields = user_fields.to_vec();
// (ie doggo.name is a subset of doggo) then we push it right after doggo and with the same weight.
let mut real_fields = Vec::new();
for (id, field_from_map) in fields_ids_map.iter() {
for (weight, user_field) in user_fields.iter().enumerate() {
if crate::is_faceted_by(field_from_map, user_field)
&& !user_fields.contains(&field_from_map)
&& !real_fields.contains(&field_from_map)
{
real_fields.push(field_from_map);
@ -2427,6 +2464,14 @@ pub(crate) mod tests {
11 0
4 1
"###);
db_snap!(index, fields_ids_map, @r###"
0 primary_key |
"###);
db_snap!(index, searchable_fields, @r###"["primary_key"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
"###);
index
.add_documents(documents!([
@ -2442,6 +2487,16 @@ pub(crate) mod tests {
11 0
4 1
"###);
db_snap!(index, fields_ids_map, @r###"
0 primary_key |
1 a |
"###);
db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
1 1 |
"###);
index.delete_documents(Default::default());
@ -2452,6 +2507,16 @@ pub(crate) mod tests {
11 0
4 1
"###);
db_snap!(index, fields_ids_map, @r###"
0 primary_key |
1 a |
"###);
db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
1 1 |
"###);
index
.add_documents(documents!([
@ -2467,6 +2532,16 @@ pub(crate) mod tests {
11 0
4 1
"###);
db_snap!(index, fields_ids_map, @r###"
0 primary_key |
1 a |
"###);
db_snap!(index, searchable_fields, @r###"["primary_key", "a"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
1 1 |
"###);
let rtxn = index.read_txn().unwrap();
let search = Search::new(&rtxn, &index);

View File

@ -147,7 +147,7 @@ impl<'a> Search<'a> {
pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
if has_vector_search {
let ctx = SearchContext::new(self.index, self.rtxn);
let ctx = SearchContext::new(self.index, self.rtxn)?;
filtered_universe(&ctx, &self.filter)
} else {
Ok(self.execute()?.candidates)
@ -155,7 +155,7 @@ impl<'a> Search<'a> {
}
pub fn execute(&self) -> Result<SearchResult> {
let mut ctx = SearchContext::new(self.index, self.rtxn);
let mut ctx = SearchContext::new(self.index, self.rtxn)?;
if let Some(searchable_attributes) = self.searchable_attributes {
ctx.searchable_attributes(searchable_attributes)?;

View File

@ -159,58 +159,36 @@ impl<'ctx> SearchContext<'ctx> {
/// Retrieve or insert the given value in the `word_docids` database.
fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
&keys[..],
&mut self.db_cache.word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids,
self.index.word_docids.remap_data_type::<Bytes>(),
),
}
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
&keys[..],
&mut self.db_cache.word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
fn get_db_exact_word_docids(
&mut self,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
let interned = self.word_interner.get(word).as_str();
let keys: Vec<_> =
self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
&keys[..],
&mut self.db_cache.exact_word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.exact_word_docids,
self.index.exact_word_docids.remap_data_type::<Bytes>(),
),
}
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
&keys[..],
&mut self.db_cache.exact_word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
@ -238,58 +216,36 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|fid| (interned, *fid)).collect();
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
self.searchable_fids.tolerant.iter().map(|(fid, _weight)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_docids.remap_data_type::<Bytes>(),
),
}
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
fn get_db_exact_word_prefix_docids(
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids {
Some(restricted_fids) => {
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
restricted_fids.exact.iter().map(|fid| (interned, *fid)).collect();
let interned = self.word_interner.get(prefix).as_str();
let keys: Vec<_> =
self.searchable_fids.exact.iter().map(|(fid, _weight)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.exact_word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.exact_word_prefix_docids,
self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
),
}
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.exact_word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
merge_cbo_roaring_bitmaps,
)
}
pub fn get_db_word_pair_proximity_docids(
@ -465,8 +421,8 @@ impl<'ctx> SearchContext<'ctx> {
word: Interned<String>,
fid: u16,
) -> Result<Option<RoaringBitmap>> {
// if the requested fid isn't in the restricted list, return None.
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
// if the requested fid isn't in the list of searchable, return None.
if !self.searchable_fids.contains(&fid) {
return Ok(None);
}
@ -484,8 +440,8 @@ impl<'ctx> SearchContext<'ctx> {
word_prefix: Interned<String>,
fid: u16,
) -> Result<Option<RoaringBitmap>> {
// if the requested fid isn't in the restricted list, return None.
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
// if the requested fid isn't in the searchable list, return None.
if !self.searchable_fids.contains(&fid) {
return Ok(None);
}

View File

@ -258,7 +258,7 @@ pub(crate) mod tests {
fn matching_words() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn);
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");

View File

@ -506,7 +506,7 @@ mod tests {
impl<'a> MatcherBuilder<'a> {
fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
let mut ctx = SearchContext::new(index, rtxn);
let mut ctx = SearchContext::new(index, rtxn).unwrap();
let universe = filtered_universe(&ctx, &None).unwrap();
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
&mut ctx,

View File

@ -49,13 +49,12 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::error::FieldIdMapMissingEntry;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
use crate::{
AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
UserError,
UserError, Weight,
};
/// A structure used throughout the execution of a search query.
@ -67,12 +66,25 @@ pub struct SearchContext<'ctx> {
pub phrase_interner: DedupInterner<Phrase>,
pub term_interner: Interner<QueryTerm>,
pub phrase_docids: PhraseDocIdsCache,
pub restricted_fids: Option<RestrictedFids>,
pub searchable_fids: SearchableFids,
}
impl<'ctx> SearchContext<'ctx> {
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
Self {
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> {
let searchable_fids = index.searchable_fields_and_weights(txn)?;
let exact_attributes_ids = index.exact_attributes_ids(txn)?;
let mut exact = Vec::new();
let mut tolerant = Vec::new();
for (name, fid, weight) in searchable_fids {
if exact_attributes_ids.contains(&fid) {
exact.push((fid, weight));
} else {
tolerant.push((fid, weight));
}
}
Ok(Self {
index,
txn,
db_cache: <_>::default(),
@ -80,38 +92,32 @@ impl<'ctx> SearchContext<'ctx> {
phrase_interner: <_>::default(),
term_interner: <_>::default(),
phrase_docids: <_>::default(),
restricted_fids: None,
}
searchable_fids: SearchableFids { tolerant, exact },
})
}
pub fn searchable_attributes(&mut self, searchable_attributes: &'ctx [String]) -> Result<()> {
// TODO: TAMO continue here
pub fn searchable_attributes(&mut self, attributes_to_search_on: &'ctx [String]) -> Result<()> {
if attributes_to_search_on.contains(&String::from("*")) {
return Ok(());
}
let fids_map = self.index.fields_ids_map(self.txn)?;
let searchable_names = self.index.searchable_fields(self.txn)?;
let searchable_names = self.index.searchable_fields_and_weights(self.txn)?;
let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;
let mut restricted_fids = RestrictedFids::default();
let mut contains_wildcard = false;
for field_name in searchable_attributes {
if field_name == "*" {
contains_wildcard = true;
continue;
}
let searchable_contains_name = searchable_names.iter().any(|name| name == field_name);
let fid = match (fids_map.id(field_name), searchable_contains_name) {
let mut restricted_fids = SearchableFids::default();
for field_name in attributes_to_search_on {
let searchable_weight = searchable_names.iter().find(|(name, _, _)| name == field_name);
let (fid, weight) = match searchable_weight {
// The Field id exist and the field is searchable
(Some(fid), true) => fid,
// The field is searchable but the Field id doesn't exist => Internal Error
(None, true) => {
return Err(FieldIdMapMissingEntry::FieldName {
field_name: field_name.to_string(),
process: "search",
}
.into())
}
Some((_name, fid, weight)) => (*fid, *weight),
// The field is not searchable => User error
(_fid, false) => {
let (valid_fields, hidden_fields) =
self.index.remove_hidden_fields(self.txn, searchable_names)?;
None => {
let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
self.txn,
searchable_names.iter().map(|(name, _, _)| name),
)?;
let field = field_name.to_string();
return Err(UserError::InvalidSearchableAttribute {
@ -124,13 +130,13 @@ impl<'ctx> SearchContext<'ctx> {
};
if exact_attributes_ids.contains(&fid) {
restricted_fids.exact.push(fid);
restricted_fids.exact.push((fid, weight));
} else {
restricted_fids.tolerant.push(fid);
restricted_fids.tolerant.push((fid, weight));
};
}
self.restricted_fids = (!contains_wildcard).then_some(restricted_fids);
self.searchable_fids = restricted_fids;
Ok(())
}
@ -152,14 +158,15 @@ impl Word {
}
#[derive(Debug, Clone, Default)]
pub struct RestrictedFids {
pub tolerant: Vec<FieldId>,
pub exact: Vec<FieldId>,
pub struct SearchableFids {
pub tolerant: Vec<(FieldId, Weight)>,
pub exact: Vec<(FieldId, Weight)>,
}
impl RestrictedFids {
impl SearchableFids {
pub fn contains(&self, fid: &FieldId) -> bool {
self.tolerant.contains(fid) || self.exact.contains(fid)
self.tolerant.iter().find(|(id, _)| id == fid).is_some()
|| self.exact.iter().find(|(id, _)| id == fid).is_some()
}
}

View File

@ -366,7 +366,7 @@ mod tests {
let tokens = tokenizer.tokenize(".");
let index = temp_index_with_documents();
let rtxn = index.read_txn()?;
let mut ctx = SearchContext::new(&index, &rtxn);
let mut ctx = SearchContext::new(&index, &rtxn)?;
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None)?;

View File

@ -1,5 +1,5 @@
use crate::index::tests::TempIndex;
use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy};
fn create_index() -> TempIndex {
let index = TempIndex::new();
@ -131,6 +131,19 @@ fn test_attribute_fid_simple() {
#[test]
fn test_attribute_fid_ngrams() {
let index = create_index();
db_snap!(index, fields_ids_map, @r###"
0 title |
1 description |
2 plot |
3 id |
"###);
db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
1 1 |
2 2 |
"###);
let txn = index.read_txn().unwrap();

View File

@ -308,6 +308,25 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
}
snap
}
/// Renders the field-id weights map of `index` as a text table for snapshots.
///
/// The first line is the `fid weight` header; each following line holds one
/// field id and its weight, left-aligned on three columns and closed by ` |`.
/// Field ids are listed in sorted order so the snapshot is deterministic.
pub fn snap_fieldids_weights_map(index: &Index) -> String {
    let txn = index.read_txn().unwrap();
    let map = index.fieldids_weights_map(&txn).unwrap();

    let mut out = String::new();
    writeln!(&mut out, "fid weight").unwrap();

    // Sort the ids before printing: the map itself gives no ordering guarantee here.
    let mut sorted_fids = map.ids().collect::<Vec<_>>();
    sorted_fids.sort();

    for fid in sorted_fids {
        let w = map.weight(fid).unwrap();
        writeln!(&mut out, "{:<3} {:<3} |", fid, w).unwrap();
    }

    out
}
/// Returns the `Debug` rendering of the searchable fields of `index`, for snapshots.
///
/// Panics if the read transaction cannot be opened or the fields cannot be read.
pub fn snap_searchable_fields(index: &Index) -> String {
    let txn = index.read_txn().unwrap();
    index.searchable_fields(&txn).map(|fields| format!("{fields:?}")).unwrap()
}
pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap();
@ -469,6 +488,12 @@ macro_rules! full_snap_of_db {
($index:ident, fields_ids_map) => {{
$crate::snapshot_tests::snap_fields_ids_map(&$index)
}};
($index:ident, fieldids_weights_map) => {{
$crate::snapshot_tests::snap_fieldids_weights_map(&$index)
}};
($index:ident, searchable_fields) => {{
$crate::snapshot_tests::snap_searchable_fields(&$index)
}};
($index:ident, geo_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index)
}};

View File

@ -496,7 +496,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.index.put_all_searchable_fields_from_fields_ids_map(
self.wtxn,
&names,
Some(&names),
&new_fields_ids_map,
)?;
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
@ -1228,18 +1228,19 @@ impl InnerIndexSettings {
// find and insert the new field ids
pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
let searchable_fields = self
.user_defined_searchable_fields
.as_ref()
.map(|searchable| searchable.iter().map(|s| s.as_str()).collect::<Vec<_>>());
// in case new fields were introduced we're going to recreate the searchable fields.
if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() {
let searchable_fields =
searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>();
index.put_all_searchable_fields_from_fields_ids_map(
wtxn,
&searchable_fields,
&self.fields_ids_map,
)?;
let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
self.searchable_fields_ids = searchable_fields_ids;
}
index.put_all_searchable_fields_from_fields_ids_map(
wtxn,
searchable_fields.as_deref(),
&self.fields_ids_map,
)?;
let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
self.searchable_fields_ids = searchable_fields_ids;
Ok(())
}