From c5a60754848e4bb0c2e9bf87781d6584e550b00f Mon Sep 17 00:00:00 2001 From: many Date: Wed, 6 Oct 2021 12:11:07 +0200 Subject: [PATCH] Make max_position_per_attributes changable --- http-ui/src/main.rs | 8 ++++++++ .../extract/extract_docid_word_positions.rs | 5 ++++- milli/src/update/index_documents/extract/mod.rs | 4 ++++ milli/src/update/index_documents/mod.rs | 4 ++++ milli/src/update/settings.rs | 3 +++ milli/src/update/update_builder.rs | 8 ++++++++ 6 files changed, 31 insertions(+), 1 deletion(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 27fc138dd..652a88451 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -131,6 +131,11 @@ pub struct IndexerOpt { /// Number of parallel jobs for indexing, defaults to # of CPUs. #[structopt(long)] pub indexing_jobs: Option, + + /// Maximum relative position in an attribute for a word to be indexed. + /// Any value higher than 65535 will be clamped. + #[structopt(long)] + pub max_positions_per_attributes: Option, } struct Highlighter<'a, A> { @@ -346,6 +351,9 @@ async fn main() -> anyhow::Result<()> { if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level { update_builder.chunk_compression_level(chunk_compression_level); } + if let Some(max_pos_per_attributes) = indexer_opt_cloned.max_positions_per_attributes { + update_builder.max_positions_per_attributes(max_pos_per_attributes); + } update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); update_builder.log_every_n(indexer_opt_cloned.log_every_n); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index df19125c6..fa1381412 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -23,7 +23,10 @@ pub fn extract_docid_word_positions( indexer: GrenadParameters, searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, + max_positions_per_attributes: Option, ) -> Result<(RoaringBitmap, grenad::Reader)> { + let max_positions_per_attributes = max_positions_per_attributes + .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); let mut documents_ids = RoaringBitmap::new(); @@ -62,7 +65,7 @@ pub fn extract_docid_word_positions( if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); let tokens = process_tokens(analyzed.tokens()) - .take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE); + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { let token = token.text().trim(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 0406e8ef4..0f04418ed 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -42,6 +42,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id: FieldId, geo_field_id: Option, stop_words: Option>, + max_positions_per_attributes: Option, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -55,6 +56,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id, geo_field_id, &stop_words, + max_positions_per_attributes, ) }) .collect(); @@ -177,6 +179,7 @@ fn extract_documents_data( primary_key_id: FieldId, geo_field_id: Option, stop_words: &Option>, + max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, (grenad::Reader, grenad::Reader), @@ -206,6 +209,7 @@ fn extract_documents_data( indexer.clone(), searchable_fields, stop_words.as_ref(), + max_positions_per_attributes, )?; // send documents_ids to DB writer diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 8138c6191..92bcab0e9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -68,6 +68,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, facet_level_group_size: Option, facet_min_level_size: Option, words_prefix_threshold: Option, @@ -104,6 +105,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { update_method: IndexDocumentsMethod::ReplaceDocuments, autogenerate_docids: false, update_id, + max_positions_per_attributes: None, } } @@ -262,6 +264,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { primary_key_id, geo_field_id, stop_words, + self.max_positions_per_attributes, ) }); @@ -284,6 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, thread_pool: self.thread_pool, + max_positions_per_attributes: self.max_positions_per_attributes, update_id: self.update_id, }; let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 4aa79f6e3..41c156676 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -69,6 +69,7 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, update_id: u64, searchable_fields: Setting>, @@ -108,6 +109,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, update_id, + max_positions_per_attributes: None, } } @@ -237,6 +239,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.thread_pool = self.thread_pool; + indexing_builder.max_positions_per_attributes = self.max_positions_per_attributes; indexing_builder.execute_raw(output, &cb)?; Ok(()) diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 561c4bc50..20ec28e06 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -12,6 +12,7 @@ pub struct UpdateBuilder<'a> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, pub(crate) update_id: u64, } @@ -25,6 +26,7 @@ impl<'a> UpdateBuilder<'a> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, + max_positions_per_attributes: None, update_id, } } @@ -57,6 +59,10 @@ impl<'a> UpdateBuilder<'a> { self.thread_pool = Some(thread_pool); } + pub fn max_positions_per_attributes(&mut self, max_positions_per_attributes: u32) { + self.max_positions_per_attributes = Some(max_positions_per_attributes); + } + pub fn clear_documents<'t, 'u, 'i>( self, wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -87,6 +93,7 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; + builder.max_positions_per_attributes = self.max_positions_per_attributes; builder } @@ -105,6 +112,7 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; + builder.max_positions_per_attributes = self.max_positions_per_attributes; builder }