mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Make max_positions_per_attributes changeable
This commit is contained in:
parent
360c5ff3df
commit
c5a6075484
@ -131,6 +131,11 @@ pub struct IndexerOpt {
|
|||||||
/// Number of parallel jobs for indexing, defaults to # of CPUs.
|
/// Number of parallel jobs for indexing, defaults to # of CPUs.
|
||||||
#[structopt(long)]
|
#[structopt(long)]
|
||||||
pub indexing_jobs: Option<usize>,
|
pub indexing_jobs: Option<usize>,
|
||||||
|
|
||||||
|
/// Maximum relative position in an attribute for a word to be indexed.
|
||||||
|
/// Any value higher than 65535 will be clamped.
|
||||||
|
#[structopt(long)]
|
||||||
|
pub max_positions_per_attributes: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Highlighter<'a, A> {
|
struct Highlighter<'a, A> {
|
||||||
@ -346,6 +351,9 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level {
|
if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level {
|
||||||
update_builder.chunk_compression_level(chunk_compression_level);
|
update_builder.chunk_compression_level(chunk_compression_level);
|
||||||
}
|
}
|
||||||
|
if let Some(max_pos_per_attributes) = indexer_opt_cloned.max_positions_per_attributes {
|
||||||
|
update_builder.max_positions_per_attributes(max_pos_per_attributes);
|
||||||
|
}
|
||||||
update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap());
|
update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap());
|
||||||
update_builder.log_every_n(indexer_opt_cloned.log_every_n);
|
update_builder.log_every_n(indexer_opt_cloned.log_every_n);
|
||||||
update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
|
update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize);
|
||||||
|
@ -23,7 +23,10 @@ pub fn extract_docid_word_positions<R: io::Read>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
stop_words: Option<&fst::Set<&[u8]>>,
|
stop_words: Option<&fst::Set<&[u8]>>,
|
||||||
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
|
) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
|
||||||
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
@ -62,7 +65,7 @@ pub fn extract_docid_word_positions<R: io::Read>(
|
|||||||
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
||||||
let analyzed = analyzer.analyze(field);
|
let analyzed = analyzer.analyze(field);
|
||||||
let tokens = process_tokens(analyzed.tokens())
|
let tokens = process_tokens(analyzed.tokens())
|
||||||
.take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE);
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
for (index, token) in tokens {
|
for (index, token) in tokens {
|
||||||
let token = token.text().trim();
|
let token = token.text().trim();
|
||||||
|
@ -42,6 +42,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
geo_field_id: Option<FieldId>,
|
geo_field_id: Option<FieldId>,
|
||||||
stop_words: Option<fst::Set<&[u8]>>,
|
stop_words: Option<fst::Set<&[u8]>>,
|
||||||
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
|
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
@ -55,6 +56,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_field_id,
|
geo_field_id,
|
||||||
&stop_words,
|
&stop_words,
|
||||||
|
max_positions_per_attributes,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
@ -177,6 +179,7 @@ fn extract_documents_data(
|
|||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
geo_field_id: Option<FieldId>,
|
geo_field_id: Option<FieldId>,
|
||||||
stop_words: &Option<fst::Set<&[u8]>>,
|
stop_words: &Option<fst::Set<&[u8]>>,
|
||||||
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(
|
) -> Result<(
|
||||||
grenad::Reader<CursorClonableMmap>,
|
grenad::Reader<CursorClonableMmap>,
|
||||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||||
@ -206,6 +209,7 @@ fn extract_documents_data(
|
|||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
stop_words.as_ref(),
|
stop_words.as_ref(),
|
||||||
|
max_positions_per_attributes,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// send documents_ids to DB writer
|
// send documents_ids to DB writer
|
||||||
|
@ -68,6 +68,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
pub(crate) chunk_compression_type: CompressionType,
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||||
|
pub(crate) max_positions_per_attributes: Option<u32>,
|
||||||
facet_level_group_size: Option<NonZeroUsize>,
|
facet_level_group_size: Option<NonZeroUsize>,
|
||||||
facet_min_level_size: Option<NonZeroUsize>,
|
facet_min_level_size: Option<NonZeroUsize>,
|
||||||
words_prefix_threshold: Option<u32>,
|
words_prefix_threshold: Option<u32>,
|
||||||
@ -104,6 +105,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||||
autogenerate_docids: false,
|
autogenerate_docids: false,
|
||||||
update_id,
|
update_id,
|
||||||
|
max_positions_per_attributes: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -262,6 +264,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_field_id,
|
geo_field_id,
|
||||||
stop_words,
|
stop_words,
|
||||||
|
self.max_positions_per_attributes,
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -284,6 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
chunk_compression_type: self.chunk_compression_type,
|
chunk_compression_type: self.chunk_compression_type,
|
||||||
chunk_compression_level: self.chunk_compression_level,
|
chunk_compression_level: self.chunk_compression_level,
|
||||||
thread_pool: self.thread_pool,
|
thread_pool: self.thread_pool,
|
||||||
|
max_positions_per_attributes: self.max_positions_per_attributes,
|
||||||
update_id: self.update_id,
|
update_id: self.update_id,
|
||||||
};
|
};
|
||||||
let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
|
let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
|
||||||
|
@ -69,6 +69,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
|||||||
pub(crate) chunk_compression_type: CompressionType,
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||||
|
pub(crate) max_positions_per_attributes: Option<u32>,
|
||||||
update_id: u64,
|
update_id: u64,
|
||||||
|
|
||||||
searchable_fields: Setting<Vec<String>>,
|
searchable_fields: Setting<Vec<String>>,
|
||||||
@ -108,6 +109,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
synonyms: Setting::NotSet,
|
synonyms: Setting::NotSet,
|
||||||
primary_key: Setting::NotSet,
|
primary_key: Setting::NotSet,
|
||||||
update_id,
|
update_id,
|
||||||
|
max_positions_per_attributes: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -237,6 +239,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
indexing_builder.chunk_compression_type = self.chunk_compression_type;
|
indexing_builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
indexing_builder.chunk_compression_level = self.chunk_compression_level;
|
indexing_builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
indexing_builder.thread_pool = self.thread_pool;
|
indexing_builder.thread_pool = self.thread_pool;
|
||||||
|
indexing_builder.max_positions_per_attributes = self.max_positions_per_attributes;
|
||||||
indexing_builder.execute_raw(output, &cb)?;
|
indexing_builder.execute_raw(output, &cb)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -12,6 +12,7 @@ pub struct UpdateBuilder<'a> {
|
|||||||
pub(crate) chunk_compression_type: CompressionType,
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
pub(crate) thread_pool: Option<&'a ThreadPool>,
|
||||||
|
pub(crate) max_positions_per_attributes: Option<u32>,
|
||||||
pub(crate) update_id: u64,
|
pub(crate) update_id: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -25,6 +26,7 @@ impl<'a> UpdateBuilder<'a> {
|
|||||||
chunk_compression_type: CompressionType::None,
|
chunk_compression_type: CompressionType::None,
|
||||||
chunk_compression_level: None,
|
chunk_compression_level: None,
|
||||||
thread_pool: None,
|
thread_pool: None,
|
||||||
|
max_positions_per_attributes: None,
|
||||||
update_id,
|
update_id,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -57,6 +59,10 @@ impl<'a> UpdateBuilder<'a> {
|
|||||||
self.thread_pool = Some(thread_pool);
|
self.thread_pool = Some(thread_pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn max_positions_per_attributes(&mut self, max_positions_per_attributes: u32) {
|
||||||
|
self.max_positions_per_attributes = Some(max_positions_per_attributes);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn clear_documents<'t, 'u, 'i>(
|
pub fn clear_documents<'t, 'u, 'i>(
|
||||||
self,
|
self,
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -87,6 +93,7 @@ impl<'a> UpdateBuilder<'a> {
|
|||||||
builder.chunk_compression_type = self.chunk_compression_type;
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
builder.chunk_compression_level = self.chunk_compression_level;
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
builder.thread_pool = self.thread_pool;
|
builder.thread_pool = self.thread_pool;
|
||||||
|
builder.max_positions_per_attributes = self.max_positions_per_attributes;
|
||||||
|
|
||||||
builder
|
builder
|
||||||
}
|
}
|
||||||
@ -105,6 +112,7 @@ impl<'a> UpdateBuilder<'a> {
|
|||||||
builder.chunk_compression_type = self.chunk_compression_type;
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
builder.chunk_compression_level = self.chunk_compression_level;
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
builder.thread_pool = self.thread_pool;
|
builder.thread_pool = self.thread_pool;
|
||||||
|
builder.max_positions_per_attributes = self.max_positions_per_attributes;
|
||||||
|
|
||||||
builder
|
builder
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user