mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
feat(lib): auto-batching
This commit is contained in:
parent
622c15e825
commit
c9a236b0af
28 changed files with 1181 additions and 777 deletions
|
@ -6,10 +6,10 @@ use anyhow::Context;
|
|||
use heed::{EnvOpenOptions, RoTxn};
|
||||
use indexmap::IndexMap;
|
||||
use milli::documents::DocumentBatchReader;
|
||||
use milli::update::{IndexDocumentsConfig, IndexerConfig};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::document_formats::read_ndjson;
|
||||
use crate::index::update_handler::UpdateHandler;
|
||||
use crate::index::updates::apply_settings_to_builder;
|
||||
|
||||
use super::error::Result;
|
||||
|
@ -85,7 +85,7 @@ impl Index {
|
|||
src: impl AsRef<Path>,
|
||||
dst: impl AsRef<Path>,
|
||||
size: usize,
|
||||
update_handler: &UpdateHandler,
|
||||
indexer_config: &IndexerConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
let dir_name = src
|
||||
.as_ref()
|
||||
|
@ -110,8 +110,7 @@ impl Index {
|
|||
let mut txn = index.write_txn()?;
|
||||
|
||||
// Apply settings first
|
||||
let builder = update_handler.update_builder();
|
||||
let mut builder = builder.settings(&mut txn, &index);
|
||||
let mut builder = milli::update::Settings::new(&mut txn, &index, indexer_config);
|
||||
|
||||
if let Some(primary_key) = primary_key {
|
||||
builder.set_primary_key(primary_key);
|
||||
|
@ -140,12 +139,16 @@ impl Index {
|
|||
|
||||
//If the document file is empty, we don't perform the document addition, to prevent
|
||||
//a primary key error to be thrown.
|
||||
if !documents_reader.is_empty() {
|
||||
let builder = update_handler
|
||||
.update_builder()
|
||||
.index_documents(&mut txn, &index);
|
||||
builder.execute(documents_reader, |_| ())?;
|
||||
}
|
||||
let config = IndexDocumentsConfig::default();
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
&mut txn,
|
||||
&index,
|
||||
indexer_config,
|
||||
config,
|
||||
|_| (),
|
||||
);
|
||||
builder.add_documents(documents_reader)?;
|
||||
builder.execute()?;
|
||||
}
|
||||
|
||||
txn.commit()?;
|
||||
|
|
|
@ -3,7 +3,7 @@ use std::error::Error;
|
|||
use meilisearch_error::{internal_error, Code, ErrorCode};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::error::MilliError;
|
||||
use crate::{error::MilliError, update_file_store};
|
||||
|
||||
pub type Result<T> = std::result::Result<T, IndexError>;
|
||||
|
||||
|
@ -23,7 +23,9 @@ internal_error!(
|
|||
IndexError: std::io::Error,
|
||||
heed::Error,
|
||||
fst::Error,
|
||||
serde_json::Error
|
||||
serde_json::Error,
|
||||
update_file_store::UpdateFileStoreError,
|
||||
milli::documents::Error
|
||||
);
|
||||
|
||||
impl ErrorCode for IndexError {
|
||||
|
|
|
@ -7,7 +7,7 @@ use std::sync::Arc;
|
|||
|
||||
use chrono::{DateTime, Utc};
|
||||
use heed::{EnvOpenOptions, RoTxn};
|
||||
use milli::update::Setting;
|
||||
use milli::update::{IndexerConfig, Setting};
|
||||
use milli::{obkv_to_json, FieldDistribution, FieldId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Map, Value};
|
||||
|
@ -17,7 +17,6 @@ use crate::EnvSizer;
|
|||
|
||||
use super::error::IndexError;
|
||||
use super::error::Result;
|
||||
use super::update_handler::UpdateHandler;
|
||||
use super::{Checked, Settings};
|
||||
|
||||
pub type Document = Map<String, Value>;
|
||||
|
@ -68,7 +67,7 @@ pub struct Index {
|
|||
#[derivative(Debug = "ignore")]
|
||||
pub inner: Arc<milli::Index>,
|
||||
#[derivative(Debug = "ignore")]
|
||||
pub update_handler: Arc<UpdateHandler>,
|
||||
pub indexer_config: Arc<IndexerConfig>,
|
||||
}
|
||||
|
||||
impl Deref for Index {
|
||||
|
@ -84,7 +83,7 @@ impl Index {
|
|||
path: impl AsRef<Path>,
|
||||
size: usize,
|
||||
uuid: Uuid,
|
||||
update_handler: Arc<UpdateHandler>,
|
||||
update_handler: Arc<IndexerConfig>,
|
||||
) -> Result<Self> {
|
||||
log::debug!("opening index in {}", path.as_ref().display());
|
||||
create_dir_all(&path)?;
|
||||
|
@ -94,7 +93,7 @@ impl Index {
|
|||
Ok(Index {
|
||||
inner,
|
||||
uuid,
|
||||
update_handler,
|
||||
indexer_config: update_handler,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ pub use updates::{apply_settings_to_builder, Checked, Facets, Settings, Unchecke
|
|||
mod dump;
|
||||
pub mod error;
|
||||
mod search;
|
||||
pub mod update_handler;
|
||||
pub mod updates;
|
||||
|
||||
#[allow(clippy::module_inception)]
|
||||
|
@ -26,6 +25,7 @@ pub mod test {
|
|||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use milli::update::IndexerConfig;
|
||||
use milli::update::{DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod};
|
||||
use nelson::Mocker;
|
||||
use serde_json::{Map, Value};
|
||||
|
@ -33,7 +33,6 @@ pub mod test {
|
|||
|
||||
use super::error::Result;
|
||||
use super::index::Index;
|
||||
use super::update_handler::UpdateHandler;
|
||||
use super::{Checked, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings};
|
||||
use crate::update_file_store::UpdateFileStore;
|
||||
|
||||
|
@ -52,7 +51,7 @@ pub mod test {
|
|||
path: impl AsRef<Path>,
|
||||
size: usize,
|
||||
uuid: Uuid,
|
||||
update_handler: Arc<UpdateHandler>,
|
||||
update_handler: Arc<IndexerConfig>,
|
||||
) -> Result<Self> {
|
||||
let index = Index::open(path, size, uuid, update_handler)?;
|
||||
Ok(Self::Real(index))
|
||||
|
@ -62,7 +61,7 @@ pub mod test {
|
|||
src: impl AsRef<Path>,
|
||||
dst: impl AsRef<Path>,
|
||||
size: usize,
|
||||
update_handler: &UpdateHandler,
|
||||
update_handler: &IndexerConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
Index::load_dump(src, dst, size, update_handler)
|
||||
}
|
||||
|
@ -157,21 +156,18 @@ pub mod test {
|
|||
pub fn update_documents(
|
||||
&self,
|
||||
method: IndexDocumentsMethod,
|
||||
content_uuid: Uuid,
|
||||
primary_key: Option<String>,
|
||||
file_store: UpdateFileStore,
|
||||
contents: impl Iterator<Item = Uuid>,
|
||||
) -> Result<DocumentAdditionResult> {
|
||||
match self {
|
||||
MockIndex::Real(index) => {
|
||||
index.update_documents(method, content_uuid, primary_key, file_store)
|
||||
index.update_documents(method, primary_key, file_store, contents)
|
||||
}
|
||||
MockIndex::Mock(mocker) => unsafe {
|
||||
mocker.get("update_documents").call((
|
||||
method,
|
||||
content_uuid,
|
||||
primary_key,
|
||||
file_store,
|
||||
))
|
||||
mocker
|
||||
.get("update_documents")
|
||||
.call((method, primary_key, file_store, contents))
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
|
@ -295,7 +295,7 @@ fn compute_value_matches<'a, A: AsRef<[u8]>>(
|
|||
let mut start = 0;
|
||||
for (word, token) in analyzed.reconstruct() {
|
||||
if token.is_word() {
|
||||
if let Some(length) = matcher.matches(token.text()) {
|
||||
if let Some(length) = matcher.matches(&token) {
|
||||
infos.push(MatchInfo { start, length });
|
||||
}
|
||||
}
|
||||
|
@ -486,18 +486,18 @@ fn format_fields<A: AsRef<[u8]>>(
|
|||
|
||||
/// trait to allow unit testing of `format_fields`
|
||||
trait Matcher {
|
||||
fn matches(&self, w: &str) -> Option<usize>;
|
||||
fn matches(&self, w: &Token) -> Option<usize>;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl Matcher for BTreeMap<&str, Option<usize>> {
|
||||
fn matches(&self, w: &str) -> Option<usize> {
|
||||
self.get(w).cloned().flatten()
|
||||
fn matches(&self, w: &Token) -> Option<usize> {
|
||||
self.get(w.text()).cloned().flatten()
|
||||
}
|
||||
}
|
||||
|
||||
impl Matcher for MatchingWords {
|
||||
fn matches(&self, w: &str) -> Option<usize> {
|
||||
fn matches(&self, w: &Token) -> Option<usize> {
|
||||
self.matching_bytes(w)
|
||||
}
|
||||
}
|
||||
|
@ -579,7 +579,7 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
|
|||
let mut tokens = analyzed.reconstruct().peekable();
|
||||
|
||||
while let Some((word, token)) =
|
||||
tokens.next_if(|(_, token)| matcher.matches(token.text()).is_none())
|
||||
tokens.next_if(|(_, token)| matcher.matches(token).is_none())
|
||||
{
|
||||
buffer.push((word, token));
|
||||
}
|
||||
|
@ -623,7 +623,7 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
|
|||
// Check if we need to do highlighting or computed matches before calling
|
||||
// Matcher::match since the call is expensive.
|
||||
if format_options.highlight && token.is_word() {
|
||||
if let Some(length) = matcher.matches(token.text()) {
|
||||
if let Some(length) = matcher.matches(&token) {
|
||||
match word.get(..length).zip(word.get(length..)) {
|
||||
Some((head, tail)) => {
|
||||
out.push_str(&self.marks.0);
|
||||
|
@ -653,7 +653,7 @@ fn parse_filter(facets: &Value) -> Result<Option<Filter>> {
|
|||
match facets {
|
||||
Value::String(expr) => {
|
||||
let condition = Filter::from_str(expr)?;
|
||||
Ok(Some(condition))
|
||||
Ok(condition)
|
||||
}
|
||||
Value::Array(arr) => parse_filter_array(arr),
|
||||
v => Err(FacetError::InvalidExpression(&["Array"], v.clone()).into()),
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
use milli::update::UpdateBuilder;
|
||||
use milli::CompressionType;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
use crate::options::IndexerOpts;
|
||||
|
||||
pub struct UpdateHandler {
|
||||
max_nb_chunks: Option<usize>,
|
||||
chunk_compression_level: Option<u32>,
|
||||
thread_pool: ThreadPool,
|
||||
log_frequency: usize,
|
||||
max_memory: Option<usize>,
|
||||
chunk_compression_type: CompressionType,
|
||||
}
|
||||
|
||||
impl UpdateHandler {
|
||||
pub fn new(opt: &IndexerOpts) -> anyhow::Result<Self> {
|
||||
let thread_pool = rayon::ThreadPoolBuilder::new()
|
||||
.num_threads(opt.indexing_jobs.unwrap_or(num_cpus::get() / 2))
|
||||
.build()?;
|
||||
|
||||
Ok(Self {
|
||||
max_nb_chunks: opt.max_nb_chunks,
|
||||
chunk_compression_level: opt.chunk_compression_level,
|
||||
thread_pool,
|
||||
log_frequency: opt.log_every_n,
|
||||
max_memory: opt.max_memory.map(|m| m.get_bytes() as usize),
|
||||
chunk_compression_type: opt.chunk_compression_type,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn update_builder(&self) -> UpdateBuilder {
|
||||
// We prepare the update by using the update builder.
|
||||
let mut update_builder = UpdateBuilder::new();
|
||||
if let Some(max_nb_chunks) = self.max_nb_chunks {
|
||||
update_builder.max_nb_chunks(max_nb_chunks);
|
||||
}
|
||||
if let Some(chunk_compression_level) = self.chunk_compression_level {
|
||||
update_builder.chunk_compression_level(chunk_compression_level);
|
||||
}
|
||||
update_builder.thread_pool(&self.thread_pool);
|
||||
update_builder.log_every_n(self.log_frequency);
|
||||
if let Some(max_memory) = self.max_memory {
|
||||
update_builder.max_memory(max_memory);
|
||||
}
|
||||
update_builder.chunk_compression_type(self.chunk_compression_type);
|
||||
update_builder
|
||||
}
|
||||
}
|
|
@ -5,7 +5,8 @@ use std::num::NonZeroUsize;
|
|||
use log::{debug, info, trace};
|
||||
use milli::documents::DocumentBatchReader;
|
||||
use milli::update::{
|
||||
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod, Setting,
|
||||
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||
Setting,
|
||||
};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use uuid::Uuid;
|
||||
|
@ -178,7 +179,7 @@ impl Index {
|
|||
txn: &mut heed::RwTxn<'a, 'b>,
|
||||
primary_key: String,
|
||||
) -> Result<IndexMeta> {
|
||||
let mut builder = self.update_handler.update_builder().settings(txn, self);
|
||||
let mut builder = milli::update::Settings::new(txn, self, self.indexer_config.as_ref());
|
||||
builder.set_primary_key(primary_key);
|
||||
builder.execute(|_| ())?;
|
||||
let meta = IndexMeta::new_txn(self, txn)?;
|
||||
|
@ -197,10 +198,7 @@ impl Index {
|
|||
/// Deletes `ids` from the index, and returns how many documents were deleted.
|
||||
pub fn delete_documents(&self, ids: &[String]) -> Result<DocumentDeletionResult> {
|
||||
let mut txn = self.write_txn()?;
|
||||
let mut builder = self
|
||||
.update_handler
|
||||
.update_builder()
|
||||
.delete_documents(&mut txn, self)?;
|
||||
let mut builder = milli::update::DeleteDocuments::new(&mut txn, self)?;
|
||||
|
||||
// We ignore unexisting document ids
|
||||
ids.iter().for_each(|id| {
|
||||
|
@ -216,11 +214,7 @@ impl Index {
|
|||
|
||||
pub fn clear_documents(&self) -> Result<()> {
|
||||
let mut txn = self.write_txn()?;
|
||||
self.update_handler
|
||||
.update_builder()
|
||||
.clear_documents(&mut txn, self)
|
||||
.execute()?;
|
||||
|
||||
milli::update::ClearDocuments::new(&mut txn, self).execute()?;
|
||||
txn.commit()?;
|
||||
|
||||
Ok(())
|
||||
|
@ -229,9 +223,9 @@ impl Index {
|
|||
pub fn update_documents(
|
||||
&self,
|
||||
method: IndexDocumentsMethod,
|
||||
content_uuid: Uuid,
|
||||
primary_key: Option<String>,
|
||||
file_store: UpdateFileStore,
|
||||
contents: impl IntoIterator<Item = Uuid>,
|
||||
) -> Result<DocumentAdditionResult> {
|
||||
trace!("performing document addition");
|
||||
let mut txn = self.write_txn()?;
|
||||
|
@ -242,17 +236,27 @@ impl Index {
|
|||
}
|
||||
}
|
||||
|
||||
let config = IndexDocumentsConfig {
|
||||
update_method: method,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let indexing_callback = |indexing_step| debug!("update: {:?}", indexing_step);
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
&mut txn,
|
||||
self,
|
||||
self.indexer_config.as_ref(),
|
||||
config,
|
||||
indexing_callback,
|
||||
);
|
||||
|
||||
let content_file = file_store.get_update(content_uuid).unwrap();
|
||||
let reader = DocumentBatchReader::from_reader(content_file).unwrap();
|
||||
for content_uuid in contents.into_iter() {
|
||||
let content_file = file_store.get_update(content_uuid)?;
|
||||
let reader = DocumentBatchReader::from_reader(content_file)?;
|
||||
builder.add_documents(reader)?;
|
||||
}
|
||||
|
||||
let mut builder = self
|
||||
.update_handler
|
||||
.update_builder()
|
||||
.index_documents(&mut txn, self);
|
||||
builder.index_documents_method(method);
|
||||
let addition = builder.execute(reader, indexing_callback)?;
|
||||
let addition = builder.execute()?;
|
||||
|
||||
txn.commit()?;
|
||||
|
||||
|
@ -264,10 +268,8 @@ impl Index {
|
|||
pub fn update_settings(&self, settings: &Settings<Checked>) -> Result<()> {
|
||||
// We must use the write transaction of the update here.
|
||||
let mut txn = self.write_txn()?;
|
||||
let mut builder = self
|
||||
.update_handler
|
||||
.update_builder()
|
||||
.settings(&mut txn, self);
|
||||
let mut builder =
|
||||
milli::update::Settings::new(&mut txn, self, self.indexer_config.as_ref());
|
||||
|
||||
apply_settings_to_builder(settings, &mut builder);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue