diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 2bb6a50a1..fbe756ac6 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -18,7 +18,8 @@ fst = "0.4.7"
 fxhash = "0.2.1"
 geoutils = "0.4.1"
 grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
+# heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
+heed = { git = "https://github.com/meilisearch/heed", branch = "compute_size", default-features = false, features = ["lmdb", "sync-read-txn"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 memmap2 = "0.5.3"
diff --git a/milli/src/error.rs b/milli/src/error.rs
index c817f64fa..d3f0a179f 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -116,6 +116,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
         }
     )]
     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
+    #[error("{}", HeedError::BadOpenOptions)]
+    InvalidLmdbOpenOptions,
     #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")]
     SortRankingRuleMissing,
     #[error("The database file is in an invalid state.")]
@@ -244,6 +246,7 @@ impl From<HeedError> for Error {
             HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
             HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
             HeedError::DatabaseClosing => InternalError(DatabaseClosing),
+            HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions),
         }
     }
 }
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index d1f030fdd..f5e04435d 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -278,27 +278,30 @@ where
         let stop_words = self.index.stop_words(self.wtxn)?;
         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
 
+        let pool_params = GrenadParameters {
+            chunk_compression_type: self.indexer_config.chunk_compression_type,
+            chunk_compression_level: self.indexer_config.chunk_compression_level,
+            max_memory: self.indexer_config.max_memory,
+            max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
+        };
+        let documents_chunk_size =
+            self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
+        let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
+
         // Run extraction pipeline in parallel.
         pool.install(|| {
-            let params = GrenadParameters {
-                chunk_compression_type: self.indexer_config.chunk_compression_type,
-                chunk_compression_level: self.indexer_config.chunk_compression_level,
-                max_memory: self.indexer_config.max_memory,
-                max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
-            };
-
             // split obkv file into several chunks
             let original_chunk_iter = grenad_obkv_into_chunks(
                 original_documents,
-                params.clone(),
-                self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
+                pool_params.clone(),
+                documents_chunk_size,
             );
 
             // split obkv file into several chunks
             let flattened_chunk_iter = grenad_obkv_into_chunks(
                 flattened_documents,
-                params.clone(),
-                self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
+                pool_params.clone(),
+                documents_chunk_size,
            );
 
             let result = original_chunk_iter
@@ -308,14 +311,14 @@ where
                     extract::data_from_obkv_documents(
                         original_chunk,
                         flattened_chunk,
-                        params,
+                        pool_params,
                         lmdb_writer_sx.clone(),
                         searchable_fields,
                         faceted_fields,
                         primary_key_id,
                         geo_fields_ids,
                         stop_words,
-                        self.indexer_config.max_positions_per_attributes,
+                        max_positions_per_attributes,
                         exact_attributes,
                     )
                 });
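
Note on the `error.rs` hunk: the new `InvalidLmdbOpenOptions` variant borrows its user-facing message from heed by passing `HeedError::BadOpenOptions` as a format argument to thiserror, so the wording lives in one place. Below is a minimal, self-contained sketch of that pattern; the `HeedError` enum and its message text here are stand-ins for illustration, not the real heed types:

```rust
use thiserror::Error;

// Stand-in for heed's error type; the Display text is invented for this sketch.
#[derive(Debug, Error)]
pub enum HeedError {
    #[error("invalid options were provided when opening the LMDB environment")]
    BadOpenOptions,
}

#[derive(Debug, Error)]
pub enum UserError {
    // thiserror forwards extra format arguments to `write!`, so this variant's
    // Display output is exactly heed's own wording, duplicated nowhere.
    #[error("{}", HeedError::BadOpenOptions)]
    InvalidLmdbOpenOptions,
}

impl From<HeedError> for UserError {
    fn from(error: HeedError) -> UserError {
        match error {
            HeedError::BadOpenOptions => UserError::InvalidLmdbOpenOptions,
        }
    }
}

fn main() {
    // Prints heed's message even though the variant lives in the caller's type.
    println!("{}", UserError::from(HeedError::BadOpenOptions));
}
```

Mapping `BadOpenOptions` to a `UserError` rather than an `InternalError` is consistent with open options (such as the map size) coming from user configuration, though the diff itself does not state the rationale.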
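Note on the `mod.rs` hunks: hoisting `pool_params`, `documents_chunk_size`, and `max_positions_per_attributes` above `pool.install` applies the 4MiB default once instead of at two call sites, and the closure then captures ready-made values instead of re-reading `self.indexer_config`. A rough sketch of the same pattern, using hypothetical names (`Params`, `index`) rather than milli's actual types:

```rust
use rayon::ThreadPoolBuilder;

// Hypothetical stand-in for milli's GrenadParameters.
#[derive(Clone, Debug)]
struct Params {
    max_memory: Option<usize>,
}

fn index(max_memory: Option<usize>, documents_chunk_size: Option<usize>) {
    // Resolve configuration once, outside the pool: the default is applied in
    // a single place, and the closure below captures plain values.
    let params = Params { max_memory };
    let chunk_size = documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB

    let pool = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
    pool.install(|| {
        // Both "chunking" call sites now receive the same precomputed values.
        for pass in ["original", "flattened"] {
            let p = params.clone();
            println!("{pass}: chunk_size={chunk_size}, max_memory={:?}", p.max_memory);
        }
    });
}

fn main() {
    index(Some(512 * 1024 * 1024), None);
}
```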