feat(lib): auto-batching

This commit is contained in:
mpostma 2022-01-19 11:21:19 +01:00 committed by ad hoc
parent 622c15e825
commit c9a236b0af
No known key found for this signature in database
GPG key ID: 4F00A782990CC643
28 changed files with 1181 additions and 777 deletions

View file

@ -6,10 +6,10 @@ use anyhow::Context;
use heed::{EnvOpenOptions, RoTxn};
use indexmap::IndexMap;
use milli::documents::DocumentBatchReader;
use milli::update::{IndexDocumentsConfig, IndexerConfig};
use serde::{Deserialize, Serialize};
use crate::document_formats::read_ndjson;
use crate::index::update_handler::UpdateHandler;
use crate::index::updates::apply_settings_to_builder;
use super::error::Result;
@ -85,7 +85,7 @@ impl Index {
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
size: usize,
update_handler: &UpdateHandler,
indexer_config: &IndexerConfig,
) -> anyhow::Result<()> {
let dir_name = src
.as_ref()
@ -110,8 +110,7 @@ impl Index {
let mut txn = index.write_txn()?;
// Apply settings first
let builder = update_handler.update_builder();
let mut builder = builder.settings(&mut txn, &index);
let mut builder = milli::update::Settings::new(&mut txn, &index, indexer_config);
if let Some(primary_key) = primary_key {
builder.set_primary_key(primary_key);
@ -140,12 +139,16 @@ impl Index {
// If the document file is empty, we skip the document addition to prevent
// a primary key error from being thrown.
if !documents_reader.is_empty() {
let builder = update_handler
.update_builder()
.index_documents(&mut txn, &index);
builder.execute(documents_reader, |_| ())?;
}
let config = IndexDocumentsConfig::default();
let mut builder = milli::update::IndexDocuments::new(
&mut txn,
&index,
indexer_config,
config,
|_| (),
);
builder.add_documents(documents_reader)?;
builder.execute()?;
}
txn.commit()?;

View file

@ -3,7 +3,7 @@ use std::error::Error;
use meilisearch_error::{internal_error, Code, ErrorCode};
use serde_json::Value;
use crate::error::MilliError;
use crate::{error::MilliError, update_file_store};
pub type Result<T> = std::result::Result<T, IndexError>;
@ -23,7 +23,9 @@ internal_error!(
IndexError: std::io::Error,
heed::Error,
fst::Error,
serde_json::Error
serde_json::Error,
update_file_store::UpdateFileStoreError,
milli::documents::Error
);
impl ErrorCode for IndexError {

View file

@ -7,7 +7,7 @@ use std::sync::Arc;
use chrono::{DateTime, Utc};
use heed::{EnvOpenOptions, RoTxn};
use milli::update::Setting;
use milli::update::{IndexerConfig, Setting};
use milli::{obkv_to_json, FieldDistribution, FieldId};
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
@ -17,7 +17,6 @@ use crate::EnvSizer;
use super::error::IndexError;
use super::error::Result;
use super::update_handler::UpdateHandler;
use super::{Checked, Settings};
pub type Document = Map<String, Value>;
@ -68,7 +67,7 @@ pub struct Index {
#[derivative(Debug = "ignore")]
pub inner: Arc<milli::Index>,
#[derivative(Debug = "ignore")]
pub update_handler: Arc<UpdateHandler>,
pub indexer_config: Arc<IndexerConfig>,
}
impl Deref for Index {
@ -84,7 +83,7 @@ impl Index {
path: impl AsRef<Path>,
size: usize,
uuid: Uuid,
update_handler: Arc<UpdateHandler>,
update_handler: Arc<IndexerConfig>,
) -> Result<Self> {
log::debug!("opening index in {}", path.as_ref().display());
create_dir_all(&path)?;
@ -94,7 +93,7 @@ impl Index {
Ok(Index {
inner,
uuid,
update_handler,
indexer_config: update_handler,
})
}

View file

@ -4,7 +4,6 @@ pub use updates::{apply_settings_to_builder, Checked, Facets, Settings, Unchecke
mod dump;
pub mod error;
mod search;
pub mod update_handler;
pub mod updates;
#[allow(clippy::module_inception)]
@ -26,6 +25,7 @@ pub mod test {
use std::path::PathBuf;
use std::sync::Arc;
use milli::update::IndexerConfig;
use milli::update::{DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod};
use nelson::Mocker;
use serde_json::{Map, Value};
@ -33,7 +33,6 @@ pub mod test {
use super::error::Result;
use super::index::Index;
use super::update_handler::UpdateHandler;
use super::{Checked, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings};
use crate::update_file_store::UpdateFileStore;
@ -52,7 +51,7 @@ pub mod test {
path: impl AsRef<Path>,
size: usize,
uuid: Uuid,
update_handler: Arc<UpdateHandler>,
update_handler: Arc<IndexerConfig>,
) -> Result<Self> {
let index = Index::open(path, size, uuid, update_handler)?;
Ok(Self::Real(index))
@ -62,7 +61,7 @@ pub mod test {
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
size: usize,
update_handler: &UpdateHandler,
update_handler: &IndexerConfig,
) -> anyhow::Result<()> {
Index::load_dump(src, dst, size, update_handler)
}
@ -157,21 +156,18 @@ pub mod test {
pub fn update_documents(
&self,
method: IndexDocumentsMethod,
content_uuid: Uuid,
primary_key: Option<String>,
file_store: UpdateFileStore,
contents: impl Iterator<Item = Uuid>,
) -> Result<DocumentAdditionResult> {
match self {
MockIndex::Real(index) => {
index.update_documents(method, content_uuid, primary_key, file_store)
index.update_documents(method, primary_key, file_store, contents)
}
MockIndex::Mock(mocker) => unsafe {
mocker.get("update_documents").call((
method,
content_uuid,
primary_key,
file_store,
))
mocker
.get("update_documents")
.call((method, primary_key, file_store, contents))
},
}
}

View file

@ -295,7 +295,7 @@ fn compute_value_matches<'a, A: AsRef<[u8]>>(
let mut start = 0;
for (word, token) in analyzed.reconstruct() {
if token.is_word() {
if let Some(length) = matcher.matches(token.text()) {
if let Some(length) = matcher.matches(&token) {
infos.push(MatchInfo { start, length });
}
}
@ -486,18 +486,18 @@ fn format_fields<A: AsRef<[u8]>>(
/// trait to allow unit testing of `format_fields`
trait Matcher {
fn matches(&self, w: &str) -> Option<usize>;
fn matches(&self, w: &Token) -> Option<usize>;
}
#[cfg(test)]
impl Matcher for BTreeMap<&str, Option<usize>> {
fn matches(&self, w: &str) -> Option<usize> {
self.get(w).cloned().flatten()
fn matches(&self, w: &Token) -> Option<usize> {
self.get(w.text()).cloned().flatten()
}
}
impl Matcher for MatchingWords {
fn matches(&self, w: &str) -> Option<usize> {
fn matches(&self, w: &Token) -> Option<usize> {
self.matching_bytes(w)
}
}
@ -579,7 +579,7 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
let mut tokens = analyzed.reconstruct().peekable();
while let Some((word, token)) =
tokens.next_if(|(_, token)| matcher.matches(token.text()).is_none())
tokens.next_if(|(_, token)| matcher.matches(token).is_none())
{
buffer.push((word, token));
}
@ -623,7 +623,7 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
// Check if we need to do highlighting or computed matches before calling
// `Matcher::matches`, since the call is expensive.
if format_options.highlight && token.is_word() {
if let Some(length) = matcher.matches(token.text()) {
if let Some(length) = matcher.matches(&token) {
match word.get(..length).zip(word.get(length..)) {
Some((head, tail)) => {
out.push_str(&self.marks.0);
@ -653,7 +653,7 @@ fn parse_filter(facets: &Value) -> Result<Option<Filter>> {
match facets {
Value::String(expr) => {
let condition = Filter::from_str(expr)?;
Ok(Some(condition))
Ok(condition)
}
Value::Array(arr) => parse_filter_array(arr),
v => Err(FacetError::InvalidExpression(&["Array"], v.clone()).into()),

View file

@ -1,49 +0,0 @@
use milli::update::UpdateBuilder;
use milli::CompressionType;
use rayon::ThreadPool;
use crate::options::IndexerOpts;
/// Indexing options captured once from `IndexerOpts`, plus a dedicated rayon
/// thread pool, used to hand out pre-configured `UpdateBuilder`s via
/// `update_builder`.
pub struct UpdateHandler {
/// Forwarded to `UpdateBuilder::max_nb_chunks` when set.
max_nb_chunks: Option<usize>,
/// Forwarded to `UpdateBuilder::chunk_compression_level` when set.
chunk_compression_level: Option<u32>,
/// Thread pool built in `new`; a reference to it is given to every builder.
thread_pool: ThreadPool,
/// Taken from `IndexerOpts::log_every_n` and forwarded to `UpdateBuilder::log_every_n`.
log_frequency: usize,
/// Obtained from `IndexerOpts::max_memory` through `get_bytes()`; forwarded to
/// `UpdateBuilder::max_memory` when set.
max_memory: Option<usize>,
/// Always forwarded to `UpdateBuilder::chunk_compression_type`.
chunk_compression_type: CompressionType,
}
impl UpdateHandler {
/// Builds an `UpdateHandler` from the indexer options.
///
/// The thread pool uses `opt.indexing_jobs` threads when provided, otherwise
/// half of the CPU count reported by `num_cpus::get()`.
///
/// # Errors
///
/// Returns an error if the rayon thread pool cannot be built.
pub fn new(opt: &IndexerOpts) -> anyhow::Result<Self> {
// NOTE(review): on a single-core host `num_cpus::get() / 2` evaluates to 0;
// confirm how rayon interprets `num_threads(0)`.
let thread_pool = rayon::ThreadPoolBuilder::new()
.num_threads(opt.indexing_jobs.unwrap_or(num_cpus::get() / 2))
.build()?;
Ok(Self {
max_nb_chunks: opt.max_nb_chunks,
chunk_compression_level: opt.chunk_compression_level,
thread_pool,
log_frequency: opt.log_every_n,
// Stored as a byte count; `get_bytes()` result is cast to `usize`.
max_memory: opt.max_memory.map(|m| m.get_bytes() as usize),
chunk_compression_type: opt.chunk_compression_type,
})
}
/// Returns a fresh `UpdateBuilder` configured with this handler's settings.
///
/// Optional settings (`max_nb_chunks`, `chunk_compression_level`, `max_memory`)
/// are applied only when they are `Some`; the thread pool, log frequency and
/// compression type are always applied.
pub fn update_builder(&self) -> UpdateBuilder {
// We prepare the update by using the update builder.
let mut update_builder = UpdateBuilder::new();
if let Some(max_nb_chunks) = self.max_nb_chunks {
update_builder.max_nb_chunks(max_nb_chunks);
}
if let Some(chunk_compression_level) = self.chunk_compression_level {
update_builder.chunk_compression_level(chunk_compression_level);
}
// The builder receives a reference to the handler's own thread pool.
update_builder.thread_pool(&self.thread_pool);
update_builder.log_every_n(self.log_frequency);
if let Some(max_memory) = self.max_memory {
update_builder.max_memory(max_memory);
}
update_builder.chunk_compression_type(self.chunk_compression_type);
update_builder
}
}

View file

@ -5,7 +5,8 @@ use std::num::NonZeroUsize;
use log::{debug, info, trace};
use milli::documents::DocumentBatchReader;
use milli::update::{
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod, Setting,
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
Setting,
};
use serde::{Deserialize, Serialize, Serializer};
use uuid::Uuid;
@ -178,7 +179,7 @@ impl Index {
txn: &mut heed::RwTxn<'a, 'b>,
primary_key: String,
) -> Result<IndexMeta> {
let mut builder = self.update_handler.update_builder().settings(txn, self);
let mut builder = milli::update::Settings::new(txn, self, self.indexer_config.as_ref());
builder.set_primary_key(primary_key);
builder.execute(|_| ())?;
let meta = IndexMeta::new_txn(self, txn)?;
@ -197,10 +198,7 @@ impl Index {
/// Deletes `ids` from the index, and returns how many documents were deleted.
pub fn delete_documents(&self, ids: &[String]) -> Result<DocumentDeletionResult> {
let mut txn = self.write_txn()?;
let mut builder = self
.update_handler
.update_builder()
.delete_documents(&mut txn, self)?;
let mut builder = milli::update::DeleteDocuments::new(&mut txn, self)?;
// We ignore nonexistent document ids
ids.iter().for_each(|id| {
@ -216,11 +214,7 @@ impl Index {
pub fn clear_documents(&self) -> Result<()> {
let mut txn = self.write_txn()?;
self.update_handler
.update_builder()
.clear_documents(&mut txn, self)
.execute()?;
milli::update::ClearDocuments::new(&mut txn, self).execute()?;
txn.commit()?;
Ok(())
@ -229,9 +223,9 @@ impl Index {
pub fn update_documents(
&self,
method: IndexDocumentsMethod,
content_uuid: Uuid,
primary_key: Option<String>,
file_store: UpdateFileStore,
contents: impl IntoIterator<Item = Uuid>,
) -> Result<DocumentAdditionResult> {
trace!("performing document addition");
let mut txn = self.write_txn()?;
@ -242,17 +236,27 @@ impl Index {
}
}
let config = IndexDocumentsConfig {
update_method: method,
..Default::default()
};
let indexing_callback = |indexing_step| debug!("update: {:?}", indexing_step);
let mut builder = milli::update::IndexDocuments::new(
&mut txn,
self,
self.indexer_config.as_ref(),
config,
indexing_callback,
);
let content_file = file_store.get_update(content_uuid).unwrap();
let reader = DocumentBatchReader::from_reader(content_file).unwrap();
for content_uuid in contents.into_iter() {
let content_file = file_store.get_update(content_uuid)?;
let reader = DocumentBatchReader::from_reader(content_file)?;
builder.add_documents(reader)?;
}
let mut builder = self
.update_handler
.update_builder()
.index_documents(&mut txn, self);
builder.index_documents_method(method);
let addition = builder.execute(reader, indexing_callback)?;
let addition = builder.execute()?;
txn.commit()?;
@ -264,10 +268,8 @@ impl Index {
pub fn update_settings(&self, settings: &Settings<Checked>) -> Result<()> {
// We must use the write transaction of the update here.
let mut txn = self.write_txn()?;
let mut builder = self
.update_handler
.update_builder()
.settings(&mut txn, self);
let mut builder =
milli::update::Settings::new(&mut txn, self, self.indexer_config.as_ref());
apply_settings_to_builder(settings, &mut builder);