2021-06-08 17:44:25 +02:00
|
|
|
mod update_store;
|
|
|
|
|
2021-03-29 19:15:47 +02:00
|
|
|
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
2020-12-21 12:52:09 +01:00
|
|
|
use std::fmt::Display;
|
2021-04-07 14:33:44 +03:00
|
|
|
use std::fs::{create_dir_all, File};
|
2022-06-14 17:21:54 +02:00
|
|
|
use std::io::{BufReader, Cursor, Read};
|
2020-05-31 17:48:13 +02:00
|
|
|
use std::net::SocketAddr;
|
2021-03-24 15:06:54 +01:00
|
|
|
use std::num::{NonZeroU32, NonZeroUsize};
|
2020-05-31 17:48:13 +02:00
|
|
|
use std::path::PathBuf;
|
|
|
|
use std::str::FromStr;
|
2020-10-19 16:03:17 +02:00
|
|
|
use std::sync::Arc;
|
2020-05-31 17:48:13 +02:00
|
|
|
use std::time::Instant;
|
2021-06-16 18:33:33 +02:00
|
|
|
use std::{io, mem};
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2020-07-11 14:17:37 +02:00
|
|
|
use askama_warp::Template;
|
2020-12-20 11:55:21 +01:00
|
|
|
use byte_unit::Byte;
|
2021-01-07 10:15:31 +01:00
|
|
|
use either::Either;
|
2020-10-24 16:23:08 +02:00
|
|
|
use flate2::read::GzDecoder;
|
2021-06-16 18:33:33 +02:00
|
|
|
use futures::{stream, FutureExt, StreamExt};
|
2020-05-31 17:48:13 +02:00
|
|
|
use heed::EnvOpenOptions;
|
2022-06-14 17:21:54 +02:00
|
|
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
2022-06-02 18:15:36 +02:00
|
|
|
use milli::tokenizer::TokenizerBuilder;
|
2021-06-16 18:33:33 +02:00
|
|
|
use milli::update::UpdateIndexingStep::*;
|
2021-12-08 14:12:07 +01:00
|
|
|
use milli::update::{
|
|
|
|
ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
|
|
|
|
};
|
2021-09-28 11:15:24 +02:00
|
|
|
use milli::{
|
2022-04-12 13:42:14 +02:00
|
|
|
obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index,
|
2022-06-15 15:36:27 +02:00
|
|
|
MatcherBuilder, Object, SearchResult, SortError,
|
2021-09-28 11:15:24 +02:00
|
|
|
};
|
2020-11-02 19:11:22 +01:00
|
|
|
use once_cell::sync::OnceCell;
|
2021-04-07 14:33:44 +03:00
|
|
|
use serde::{Deserialize, Serialize};
|
2022-06-15 15:36:27 +02:00
|
|
|
use serde_json::Value;
|
2020-05-31 17:48:13 +02:00
|
|
|
use structopt::StructOpt;
|
2020-10-19 16:03:17 +02:00
|
|
|
use tokio::fs::File as TFile;
|
|
|
|
use tokio::io::AsyncWriteExt;
|
2020-10-20 11:19:34 +02:00
|
|
|
use tokio::sync::broadcast;
|
2022-03-14 17:13:07 +01:00
|
|
|
use tokio_stream::wrappers::BroadcastStream;
|
2021-04-07 14:33:44 +03:00
|
|
|
use warp::filters::ws::Message;
|
2021-06-16 18:33:33 +02:00
|
|
|
use warp::http::Response;
|
|
|
|
use warp::Filter;
|
2020-06-10 22:05:01 +02:00
|
|
|
|
2021-06-08 17:44:25 +02:00
|
|
|
use self::update_store::UpdateStore;
|
|
|
|
|
2021-06-22 14:17:56 +02:00
|
|
|
#[cfg(target_os = "linux")]
|
|
|
|
#[global_allocator]
|
|
|
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
/// Write-once cell holding the indexer configuration.
///
/// It is filled exactly once by `main` (built from the command line options) and
/// then borrowed by the update handlers through `GLOBAL_CONFIG.get()`.
static GLOBAL_CONFIG: OnceCell<IndexerConfig> = OnceCell::new();
|
2020-11-02 19:11:22 +01:00
|
|
|
|
2020-05-31 17:48:13 +02:00
|
|
|
#[derive(Debug, StructOpt)]
|
2020-10-19 13:44:17 +02:00
|
|
|
/// The HTTP main server of the milli project.
|
|
|
|
pub struct Opt {
|
2020-05-31 17:48:13 +02:00
|
|
|
/// The database path where the LMDB database is located.
|
|
|
|
/// It is created if it doesn't already exist.
|
|
|
|
#[structopt(long = "db", parse(from_os_str))]
|
|
|
|
database: PathBuf,
|
|
|
|
|
|
|
|
/// The maximum size the database can take on disk. It is recommended to specify
|
|
|
|
/// the whole disk space (value must be a multiple of a page size).
|
2020-12-20 11:55:21 +01:00
|
|
|
#[structopt(long = "db-size", default_value = "100 GiB")]
|
|
|
|
database_size: Byte,
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2020-10-19 16:03:17 +02:00
|
|
|
/// The maximum size the database that stores the updates can take on disk. It is recommended
|
|
|
|
/// to specify the whole disk space (value must be a multiple of a page size).
|
2020-12-20 11:55:21 +01:00
|
|
|
#[structopt(long = "udb-size", default_value = "10 GiB")]
|
|
|
|
update_database_size: Byte,
|
2020-10-19 16:03:17 +02:00
|
|
|
|
2020-07-14 11:27:46 +02:00
|
|
|
/// Disable document highlighting on the dashboard.
|
|
|
|
#[structopt(long)]
|
|
|
|
disable_highlighting: bool,
|
|
|
|
|
2020-07-12 11:04:35 +02:00
|
|
|
/// Verbose mode (-v, -vv, -vvv, etc.)
|
|
|
|
#[structopt(short, long, parse(from_occurrences))]
|
|
|
|
verbose: usize,
|
|
|
|
|
2020-05-31 17:48:13 +02:00
|
|
|
/// The ip and port on which the database will listen for HTTP requests.
|
|
|
|
#[structopt(short = "l", long, default_value = "127.0.0.1:9700")]
|
|
|
|
http_listen_addr: String,
|
2020-10-20 14:20:17 +02:00
|
|
|
|
|
|
|
#[structopt(flatten)]
|
|
|
|
indexer: IndexerOpt,
|
2020-05-31 17:48:13 +02:00
|
|
|
}
|
|
|
|
|
2020-10-26 20:18:10 +01:00
|
|
|
#[derive(Debug, Clone, StructOpt)]
|
|
|
|
pub struct IndexerOpt {
|
|
|
|
/// The amount of documents to skip before printing
|
|
|
|
/// a log regarding the indexing advancement.
|
2020-11-09 17:34:52 +01:00
|
|
|
#[structopt(long, default_value = "100000")] // 100k
|
2020-10-26 20:18:10 +01:00
|
|
|
pub log_every_n: usize,
|
|
|
|
|
|
|
|
/// MTBL max number of chunks in bytes.
|
|
|
|
#[structopt(long)]
|
|
|
|
pub max_nb_chunks: Option<usize>,
|
|
|
|
|
|
|
|
/// The maximum amount of memory to use for the MTBL buffer. It is recommended
|
|
|
|
/// to use something like 80%-90% of the available memory.
|
|
|
|
///
|
|
|
|
/// It is automatically split by the number of jobs e.g. if you use 7 jobs
|
|
|
|
/// and 7 GB of max memory, each thread will use a maximum of 1 GB.
|
2020-12-20 11:55:21 +01:00
|
|
|
#[structopt(long, default_value = "7 GiB")]
|
|
|
|
pub max_memory: Byte,
|
2020-10-26 20:18:10 +01:00
|
|
|
|
|
|
|
/// Size of the linked hash map cache when indexing.
|
|
|
|
/// The bigger it is, the faster the indexing is but the more memory it takes.
|
|
|
|
#[structopt(long, default_value = "500")]
|
|
|
|
pub linked_hash_map_size: usize,
|
|
|
|
|
|
|
|
/// The name of the compression algorithm to use when compressing intermediate
|
|
|
|
/// chunks during indexing documents.
|
|
|
|
///
|
|
|
|
/// Choosing a fast algorithm will make the indexing faster but may consume more memory.
|
2021-09-08 14:10:39 +02:00
|
|
|
#[structopt(long, possible_values = &["snappy", "zlib", "lz4", "lz4hc", "zstd"])]
|
|
|
|
pub chunk_compression_type: Option<CompressionType>,
|
2020-10-26 20:18:10 +01:00
|
|
|
|
|
|
|
/// The level of compression of the chosen algorithm.
|
|
|
|
#[structopt(long, requires = "chunk-compression-type")]
|
|
|
|
pub chunk_compression_level: Option<u32>,
|
|
|
|
|
|
|
|
/// The number of bytes to remove from the begining of the chunks while reading/sorting
|
|
|
|
/// or merging them.
|
|
|
|
///
|
|
|
|
/// File fusing must only be enable on file systems that support the `FALLOC_FL_COLLAPSE_RANGE`,
|
|
|
|
/// (i.e. ext4 and XFS). File fusing will only work if the `enable-chunk-fusing` is set.
|
2020-12-20 11:55:21 +01:00
|
|
|
#[structopt(long, default_value = "4 GiB")]
|
|
|
|
pub chunk_fusing_shrink_size: Byte,
|
2020-10-26 20:18:10 +01:00
|
|
|
|
|
|
|
/// Enable the chunk fusing or not, this reduces the amount of disk used by a factor of 2.
|
|
|
|
#[structopt(long)]
|
|
|
|
pub enable_chunk_fusing: bool,
|
|
|
|
|
|
|
|
/// Number of parallel jobs for indexing, defaults to # of CPUs.
|
|
|
|
#[structopt(long)]
|
|
|
|
pub indexing_jobs: Option<usize>,
|
2021-10-06 12:11:07 +02:00
|
|
|
|
|
|
|
/// Maximum relative position in an attribute for a word to be indexed.
|
|
|
|
/// Any value higher than 65535 will be clamped.
|
|
|
|
#[structopt(long)]
|
|
|
|
pub max_positions_per_attributes: Option<u32>,
|
2020-10-26 20:18:10 +01:00
|
|
|
}
|
|
|
|
|
2022-06-02 15:55:26 +02:00
|
|
|
struct Highlighter<'s, A> {
|
2022-06-02 18:15:36 +02:00
|
|
|
matcher_builder: MatcherBuilder<'s, A>,
|
2020-12-23 20:04:19 +01:00
|
|
|
}
|
|
|
|
|
2022-06-02 15:55:26 +02:00
|
|
|
impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
|
2022-06-02 18:15:36 +02:00
|
|
|
fn new(matcher_builder: MatcherBuilder<'s, A>) -> Self {
|
|
|
|
Self { matcher_builder }
|
2020-12-23 20:04:19 +01:00
|
|
|
}
|
|
|
|
|
2022-06-02 18:15:36 +02:00
|
|
|
fn highlight_value(&self, value: Value) -> Value {
|
2020-11-05 13:58:07 +01:00
|
|
|
match value {
|
|
|
|
Value::Null => Value::Null,
|
|
|
|
Value::Bool(boolean) => Value::Bool(boolean),
|
|
|
|
Value::Number(number) => Value::Number(number),
|
|
|
|
Value::String(old_string) => {
|
2022-06-02 18:15:36 +02:00
|
|
|
let mut matcher = self.matcher_builder.build(&old_string);
|
2022-03-30 10:50:23 +02:00
|
|
|
|
2022-04-12 13:42:14 +02:00
|
|
|
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
|
|
|
|
|
|
|
Value::String(matcher.format(format_options).to_string())
|
2021-04-07 14:33:44 +03:00
|
|
|
}
|
2022-06-02 18:15:36 +02:00
|
|
|
Value::Array(values) => {
|
|
|
|
Value::Array(values.into_iter().map(|v| self.highlight_value(v)).collect())
|
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
Value::Object(object) => Value::Object(
|
2022-06-02 18:15:36 +02:00
|
|
|
object.into_iter().map(|(k, v)| (k, self.highlight_value(v))).collect(),
|
2021-06-16 18:33:33 +02:00
|
|
|
),
|
2020-11-05 13:58:07 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-15 15:36:27 +02:00
|
|
|
fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet<String>) {
|
2020-12-23 20:04:19 +01:00
|
|
|
// TODO do we need to create a string for element that are not and needs to be highlight?
|
|
|
|
for (key, value) in object.iter_mut() {
|
|
|
|
if attributes_to_highlight.contains(key) {
|
|
|
|
let old_value = mem::take(value);
|
2022-06-02 18:15:36 +02:00
|
|
|
*value = self.highlight_value(old_value);
|
2020-12-23 20:04:19 +01:00
|
|
|
}
|
2020-08-30 21:50:30 +02:00
|
|
|
}
|
2020-08-05 13:52:27 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-11 14:17:37 +02:00
|
|
|
#[derive(Template)]
|
|
|
|
#[template(path = "index.html")]
|
|
|
|
struct IndexTemplate {
|
|
|
|
db_name: String,
|
|
|
|
db_size: usize,
|
|
|
|
docs_count: usize,
|
|
|
|
}
|
|
|
|
|
2020-10-19 19:57:15 +02:00
|
|
|
#[derive(Template)]
|
|
|
|
#[template(path = "updates.html")]
|
2020-12-21 12:52:09 +01:00
|
|
|
struct UpdatesTemplate<M: Serialize + Send, P: Serialize + Send, N: Serialize + Send + Display> {
|
2020-10-19 19:57:15 +02:00
|
|
|
db_name: String,
|
2020-10-20 12:09:38 +02:00
|
|
|
db_size: usize,
|
|
|
|
docs_count: usize,
|
2020-10-21 15:38:28 +02:00
|
|
|
updates: Vec<UpdateStatus<M, P, N>>,
|
2020-10-20 12:09:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
|
|
#[serde(tag = "type")]
|
2020-10-21 15:38:28 +02:00
|
|
|
enum UpdateStatus<M, P, N> {
|
2020-10-20 12:09:38 +02:00
|
|
|
Pending { update_id: u64, meta: M },
|
2020-10-21 15:38:28 +02:00
|
|
|
Progressing { update_id: u64, meta: P },
|
|
|
|
Processed { update_id: u64, meta: N },
|
2020-11-29 12:23:52 +01:00
|
|
|
Aborted { update_id: u64, meta: M },
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<M, P, N> UpdateStatus<M, P, N> {
|
|
|
|
fn update_id(&self) -> u64 {
|
|
|
|
match self {
|
|
|
|
UpdateStatus::Pending { update_id, .. } => *update_id,
|
|
|
|
UpdateStatus::Progressing { update_id, .. } => *update_id,
|
|
|
|
UpdateStatus::Processed { update_id, .. } => *update_id,
|
|
|
|
UpdateStatus::Aborted { update_id, .. } => *update_id,
|
|
|
|
}
|
|
|
|
}
|
2020-10-21 15:38:28 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(tag = "type")]
|
|
|
|
enum UpdateMeta {
|
2020-12-20 23:10:09 +01:00
|
|
|
DocumentsAddition { method: String, format: String, encoding: Option<String> },
|
2020-10-30 13:12:55 +01:00
|
|
|
ClearDocuments,
|
2020-11-02 15:47:21 +01:00
|
|
|
Settings(Settings),
|
2020-11-23 13:08:57 +01:00
|
|
|
Facets(Facets),
|
2020-10-21 15:38:28 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(tag = "type")]
|
|
|
|
enum UpdateMetaProgress {
|
2021-06-16 18:33:33 +02:00
|
|
|
DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
|
2020-10-19 19:57:15 +02:00
|
|
|
}
|
|
|
|
|
2021-04-07 15:06:14 +03:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
2020-11-13 16:16:07 +01:00
|
|
|
#[serde(deny_unknown_fields)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
2020-11-02 15:47:21 +01:00
|
|
|
struct Settings {
|
2021-04-07 14:33:44 +03:00
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
|
|
|
displayed_attributes: Setting<Vec<String>>,
|
|
|
|
|
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
|
|
|
searchable_attributes: Setting<Vec<String>>,
|
|
|
|
|
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
2021-06-01 15:10:34 +02:00
|
|
|
filterable_attributes: Setting<HashSet<String>>,
|
2021-04-07 14:33:44 +03:00
|
|
|
|
2021-08-30 16:12:05 +02:00
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
|
|
|
sortable_attributes: Setting<HashSet<String>>,
|
|
|
|
|
2021-04-07 14:33:44 +03:00
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
|
|
|
criteria: Setting<Vec<String>>,
|
|
|
|
|
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
|
|
|
stop_words: Setting<BTreeSet<String>>,
|
2021-04-07 11:53:57 +03:00
|
|
|
|
|
|
|
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
|
|
|
synonyms: Setting<HashMap<String, Vec<String>>>,
|
2020-11-02 15:47:21 +01:00
|
|
|
}
|
|
|
|
|
2020-11-17 21:19:25 +01:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(deny_unknown_fields)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
2020-11-23 13:08:57 +01:00
|
|
|
struct Facets {
|
2020-11-28 12:43:43 +01:00
|
|
|
level_group_size: Option<NonZeroUsize>,
|
|
|
|
min_level_size: Option<NonZeroUsize>,
|
2020-11-17 21:19:25 +01:00
|
|
|
}
|
|
|
|
|
2021-02-18 18:33:51 +01:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(deny_unknown_fields)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
|
|
|
struct WordsPrefixes {
|
|
|
|
threshold: Option<f64>,
|
|
|
|
max_prefix_length: Option<usize>,
|
|
|
|
}
|
|
|
|
|
2021-03-17 14:32:00 +01:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(deny_unknown_fields)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
|
|
|
struct WordsLevelPositions {
|
2021-03-24 15:06:54 +01:00
|
|
|
level_group_size: Option<NonZeroU32>,
|
|
|
|
min_level_size: Option<NonZeroU32>,
|
2021-03-17 14:32:00 +01:00
|
|
|
}
|
|
|
|
|
2020-11-05 11:16:39 +01:00
|
|
|
#[tokio::main]
|
|
|
|
async fn main() -> anyhow::Result<()> {
|
|
|
|
let opt = Opt::from_args();
|
|
|
|
|
2020-07-12 11:04:35 +02:00
|
|
|
stderrlog::new()
|
|
|
|
.verbosity(opt.verbose)
|
|
|
|
.show_level(false)
|
|
|
|
.timestamp(stderrlog::Timestamp::Off)
|
|
|
|
.init()?;
|
|
|
|
|
2020-10-20 15:00:58 +02:00
|
|
|
create_dir_all(&opt.database)?;
|
2020-10-30 10:56:35 +01:00
|
|
|
let mut options = EnvOpenOptions::new();
|
2020-12-20 11:55:21 +01:00
|
|
|
options.map_size(opt.database_size.get_bytes() as usize);
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2020-11-02 19:11:22 +01:00
|
|
|
// Setup the global thread pool
|
|
|
|
let jobs = opt.indexer.indexing_jobs.unwrap_or(0);
|
|
|
|
let pool = rayon::ThreadPoolBuilder::new().num_threads(jobs).build()?;
|
2021-12-08 14:12:07 +01:00
|
|
|
|
|
|
|
let config = IndexerConfig {
|
|
|
|
max_nb_chunks: opt.indexer.max_nb_chunks,
|
|
|
|
chunk_compression_level: opt.indexer.chunk_compression_level,
|
|
|
|
max_positions_per_attributes: opt.indexer.max_positions_per_attributes,
|
|
|
|
thread_pool: Some(pool),
|
|
|
|
log_every_n: Some(opt.indexer.log_every_n),
|
|
|
|
max_memory: Some(opt.indexer.max_memory.get_bytes() as usize),
|
|
|
|
chunk_compression_type: opt.indexer.chunk_compression_type.unwrap_or(CompressionType::None),
|
|
|
|
..Default::default()
|
|
|
|
};
|
|
|
|
|
|
|
|
GLOBAL_CONFIG.set(config).unwrap();
|
2020-11-02 19:11:22 +01:00
|
|
|
|
2020-08-07 13:11:31 +02:00
|
|
|
// Open the LMDB database.
|
2020-10-30 10:56:35 +01:00
|
|
|
let index = Index::new(options, &opt.database)?;
|
2020-08-07 13:11:31 +02:00
|
|
|
|
2020-10-19 16:03:17 +02:00
|
|
|
// Setup the LMDB based update database.
|
|
|
|
let mut update_store_options = EnvOpenOptions::new();
|
2020-12-20 11:55:21 +01:00
|
|
|
update_store_options.map_size(opt.update_database_size.get_bytes() as usize);
|
2020-10-19 16:03:17 +02:00
|
|
|
|
|
|
|
let update_store_path = opt.database.join("updates.mdb");
|
|
|
|
create_dir_all(&update_store_path)?;
|
|
|
|
|
2020-10-20 11:19:34 +02:00
|
|
|
let (update_status_sender, _) = broadcast::channel(100);
|
2020-10-19 16:03:17 +02:00
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
2020-10-20 15:00:58 +02:00
|
|
|
let index_cloned = index.clone();
|
2020-10-19 16:03:17 +02:00
|
|
|
let update_store = UpdateStore::open(
|
|
|
|
update_store_options,
|
|
|
|
update_store_path,
|
2020-12-22 11:23:25 +01:00
|
|
|
// the type hint is necessary: https://github.com/rust-lang/rust/issues/32600
|
2021-04-07 14:33:44 +03:00
|
|
|
move |update_id, meta, content: &_| {
|
2020-10-26 20:18:10 +01:00
|
|
|
// We prepare the update by using the update builder.
|
|
|
|
|
2020-12-20 23:43:31 +01:00
|
|
|
let before_update = Instant::now();
|
2020-10-26 20:18:10 +01:00
|
|
|
// we extract the update type and execute the update itself.
|
2021-12-08 14:12:07 +01:00
|
|
|
let result: anyhow::Result<()> = (|| match meta {
|
|
|
|
UpdateMeta::DocumentsAddition { method, format, encoding } => {
|
|
|
|
// We must use the write transaction of the update here.
|
|
|
|
let mut wtxn = index_cloned.write_txn()?;
|
|
|
|
let update_method = match method.as_str() {
|
|
|
|
"replace" => IndexDocumentsMethod::ReplaceDocuments,
|
|
|
|
"update" => IndexDocumentsMethod::UpdateDocuments,
|
|
|
|
otherwise => panic!("invalid indexing method {:?}", otherwise),
|
|
|
|
};
|
|
|
|
let indexing_config = IndexDocumentsConfig {
|
|
|
|
update_method,
|
|
|
|
autogenerate_docids: true,
|
|
|
|
..Default::default()
|
|
|
|
};
|
|
|
|
|
|
|
|
let indexing_callback = |indexing_step| {
|
|
|
|
let (current, total) = match indexing_step {
|
|
|
|
RemapDocumentAddition { documents_seen } => (documents_seen, None),
|
|
|
|
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
|
|
|
|
(documents_seen, Some(total_documents))
|
|
|
|
}
|
|
|
|
IndexDocuments { documents_seen, total_documents } => {
|
|
|
|
(documents_seen, Some(total_documents))
|
|
|
|
}
|
|
|
|
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
|
|
|
|
(databases_seen, Some(total_databases))
|
|
|
|
}
|
2021-08-31 11:44:15 +02:00
|
|
|
};
|
2021-12-08 14:12:07 +01:00
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
|
|
|
update_id,
|
|
|
|
meta: UpdateMetaProgress::DocumentsAddition {
|
|
|
|
step: indexing_step.step(),
|
|
|
|
total_steps: indexing_step.number_of_steps(),
|
|
|
|
current,
|
|
|
|
total,
|
|
|
|
},
|
2020-10-26 20:18:10 +01:00
|
|
|
});
|
2021-12-08 14:12:07 +01:00
|
|
|
};
|
|
|
|
|
2022-06-15 14:35:19 +02:00
|
|
|
let builder = milli::update::IndexDocuments::new(
|
2021-12-08 14:12:07 +01:00
|
|
|
&mut wtxn,
|
|
|
|
&index_cloned,
|
|
|
|
GLOBAL_CONFIG.get().unwrap(),
|
|
|
|
indexing_config,
|
|
|
|
indexing_callback,
|
2022-03-23 17:28:41 +01:00
|
|
|
)?;
|
2021-12-08 14:12:07 +01:00
|
|
|
|
|
|
|
let reader = match encoding.as_deref() {
|
|
|
|
Some("gzip") => Box::new(GzDecoder::new(content)),
|
|
|
|
None => Box::new(content) as Box<dyn io::Read>,
|
|
|
|
otherwise => panic!("invalid encoding format {:?}", otherwise),
|
|
|
|
};
|
|
|
|
|
|
|
|
let documents = match format.as_str() {
|
|
|
|
"csv" => documents_from_csv(reader)?,
|
|
|
|
"json" => documents_from_json(reader)?,
|
|
|
|
"jsonl" => documents_from_jsonl(reader)?,
|
|
|
|
otherwise => panic!("invalid update format {:?}", otherwise),
|
|
|
|
};
|
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?;
|
2021-12-08 14:12:07 +01:00
|
|
|
|
2022-06-15 14:35:19 +02:00
|
|
|
let (builder, user_error) = builder.add_documents(documents)?;
|
|
|
|
let _count = user_error?;
|
2021-12-08 14:12:07 +01:00
|
|
|
let result = builder.execute();
|
|
|
|
|
|
|
|
match result {
|
|
|
|
Ok(_) => wtxn.commit().map_err(Into::into),
|
|
|
|
Err(e) => Err(e.into()),
|
2020-10-30 13:12:55 +01:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
}
|
|
|
|
UpdateMeta::ClearDocuments => {
|
|
|
|
// We must use the write transaction of the update here.
|
|
|
|
let mut wtxn = index_cloned.write_txn()?;
|
|
|
|
let builder = ClearDocuments::new(&mut wtxn, &index_cloned);
|
|
|
|
|
|
|
|
match builder.execute() {
|
|
|
|
Ok(_count) => wtxn.commit().map_err(Into::into),
|
|
|
|
Err(e) => Err(e.into()),
|
2020-11-03 19:35:55 +01:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
}
|
|
|
|
UpdateMeta::Settings(settings) => {
|
|
|
|
// We must use the write transaction of the update here.
|
|
|
|
let mut wtxn = index_cloned.write_txn()?;
|
|
|
|
let mut builder = milli::update::Settings::new(
|
|
|
|
&mut wtxn,
|
|
|
|
&index_cloned,
|
|
|
|
GLOBAL_CONFIG.get().unwrap(),
|
|
|
|
);
|
|
|
|
|
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.searchable_attributes {
|
|
|
|
Setting::Set(searchable_attributes) => {
|
|
|
|
builder.set_searchable_fields(searchable_attributes)
|
2021-06-16 18:33:33 +02:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
Setting::Reset => builder.reset_searchable_fields(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2020-11-03 19:35:55 +01:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.displayed_attributes {
|
|
|
|
Setting::Set(displayed_attributes) => {
|
|
|
|
builder.set_displayed_fields(displayed_attributes)
|
2021-06-16 18:33:33 +02:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
Setting::Reset => builder.reset_displayed_fields(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2020-11-02 15:47:21 +01:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.filterable_attributes {
|
|
|
|
Setting::Set(filterable_attributes) => {
|
|
|
|
builder.set_filterable_fields(filterable_attributes)
|
2021-06-16 18:33:33 +02:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
Setting::Reset => builder.reset_filterable_fields(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2020-11-13 16:16:07 +01:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.sortable_attributes {
|
|
|
|
Setting::Set(sortable_attributes) => {
|
|
|
|
builder.set_sortable_fields(sortable_attributes)
|
2021-08-30 16:12:05 +02:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
Setting::Reset => builder.reset_sortable_fields(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2021-08-30 16:12:05 +02:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.criteria {
|
|
|
|
Setting::Set(criteria) => builder.set_criteria(criteria),
|
|
|
|
Setting::Reset => builder.reset_criteria(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2020-12-04 12:02:22 +01:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.stop_words {
|
|
|
|
Setting::Set(stop_words) => builder.set_stop_words(stop_words),
|
|
|
|
Setting::Reset => builder.reset_stop_words(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2021-03-29 19:15:47 +02:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
// We transpose the settings JSON struct into a real setting update.
|
|
|
|
match settings.synonyms {
|
|
|
|
Setting::Set(synonyms) => builder.set_synonyms(synonyms),
|
|
|
|
Setting::Reset => builder.reset_synonyms(),
|
|
|
|
Setting::NotSet => (),
|
|
|
|
}
|
2021-04-07 11:53:57 +03:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
let result = builder.execute(|indexing_step| {
|
|
|
|
let (current, total) = match indexing_step {
|
|
|
|
RemapDocumentAddition { documents_seen } => (documents_seen, None),
|
|
|
|
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
|
|
|
|
(documents_seen, Some(total_documents))
|
|
|
|
}
|
|
|
|
IndexDocuments { documents_seen, total_documents } => {
|
|
|
|
(documents_seen, Some(total_documents))
|
|
|
|
}
|
|
|
|
MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
|
|
|
|
(databases_seen, Some(total_databases))
|
|
|
|
}
|
|
|
|
};
|
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
|
|
|
update_id,
|
|
|
|
meta: UpdateMetaProgress::DocumentsAddition {
|
|
|
|
step: indexing_step.step(),
|
|
|
|
total_steps: indexing_step.number_of_steps(),
|
|
|
|
current,
|
|
|
|
total,
|
|
|
|
},
|
2020-11-03 13:20:11 +01:00
|
|
|
});
|
2021-12-08 14:12:07 +01:00
|
|
|
});
|
2020-11-03 13:20:11 +01:00
|
|
|
|
2021-12-08 14:12:07 +01:00
|
|
|
match result {
|
|
|
|
Ok(_count) => wtxn.commit().map_err(Into::into),
|
|
|
|
Err(e) => Err(e.into()),
|
2020-11-17 21:19:25 +01:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
}
|
|
|
|
UpdateMeta::Facets(levels) => {
|
|
|
|
// We must use the write transaction of the update here.
|
|
|
|
let mut wtxn = index_cloned.write_txn()?;
|
|
|
|
let mut builder = milli::update::Facets::new(&mut wtxn, &index_cloned);
|
|
|
|
if let Some(value) = levels.level_group_size {
|
|
|
|
builder.level_group_size(value);
|
|
|
|
}
|
|
|
|
if let Some(value) = levels.min_level_size {
|
|
|
|
builder.min_level_size(value);
|
2020-11-17 21:19:25 +01:00
|
|
|
}
|
2021-12-08 14:12:07 +01:00
|
|
|
match builder.execute() {
|
|
|
|
Ok(()) => wtxn.commit().map_err(Into::into),
|
|
|
|
Err(e) => Err(e.into()),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
})();
|
2020-10-20 15:00:58 +02:00
|
|
|
|
|
|
|
let meta = match result {
|
2021-06-16 18:33:33 +02:00
|
|
|
Ok(()) => {
|
|
|
|
format!("valid update content processed in {:.02?}", before_update.elapsed())
|
|
|
|
}
|
2020-10-28 11:17:36 +01:00
|
|
|
Err(e) => format!("error while processing update content: {:?}", e),
|
2020-10-20 15:00:58 +02:00
|
|
|
};
|
2020-10-20 12:28:10 +02:00
|
|
|
|
2020-10-20 12:09:38 +02:00
|
|
|
let processed = UpdateStatus::Processed { update_id, meta: meta.clone() };
|
|
|
|
let _ = update_status_sender_cloned.send(processed);
|
2020-10-20 15:00:58 +02:00
|
|
|
|
2020-10-19 16:03:17 +02:00
|
|
|
Ok(meta)
|
2021-06-16 18:33:33 +02:00
|
|
|
},
|
|
|
|
)?;
|
2020-10-19 16:03:17 +02:00
|
|
|
|
2020-10-20 15:00:58 +02:00
|
|
|
// The database name will not change.
|
2020-07-11 14:17:37 +02:00
|
|
|
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
|
2020-10-20 15:00:58 +02:00
|
|
|
let lmdb_path = opt.database.join("data.mdb");
|
2020-07-11 14:17:37 +02:00
|
|
|
|
2020-05-31 17:48:13 +02:00
|
|
|
// We run and wait on the HTTP server
|
|
|
|
|
|
|
|
// Expose an HTML page to debug the search in a browser
|
2020-10-19 19:57:15 +02:00
|
|
|
let db_name_cloned = db_name.clone();
|
2020-10-20 15:00:58 +02:00
|
|
|
let lmdb_path_cloned = lmdb_path.clone();
|
|
|
|
let index_cloned = index.clone();
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_html_route =
|
|
|
|
warp::filters::method::get().and(warp::filters::path::end()).map(move || {
|
2020-10-20 15:00:58 +02:00
|
|
|
// We retrieve the database size.
|
2021-06-16 18:33:33 +02:00
|
|
|
let db_size =
|
|
|
|
File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize;
|
2020-10-20 15:00:58 +02:00
|
|
|
|
|
|
|
// And the number of documents in the database.
|
2020-10-30 11:42:00 +01:00
|
|
|
let rtxn = index_cloned.read_txn().unwrap();
|
2020-10-20 15:00:58 +02:00
|
|
|
let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
|
|
|
|
|
|
|
|
IndexTemplate { db_name: db_name_cloned.clone(), db_size, docs_count }
|
|
|
|
});
|
2020-10-19 19:57:15 +02:00
|
|
|
|
|
|
|
let update_store_cloned = update_store.clone();
|
2020-10-20 15:00:58 +02:00
|
|
|
let lmdb_path_cloned = lmdb_path.clone();
|
|
|
|
let index_cloned = index.clone();
|
2020-10-19 19:57:15 +02:00
|
|
|
let updates_list_or_html_route = warp::filters::method::get()
|
|
|
|
.and(warp::header("Accept"))
|
|
|
|
.and(warp::path!("updates"))
|
|
|
|
.map(move |header: String| {
|
|
|
|
let update_store = update_store_cloned.clone();
|
2021-06-16 18:33:33 +02:00
|
|
|
let mut updates = update_store
|
|
|
|
.iter_metas(|processed, aborted, pending| {
|
|
|
|
let mut updates = Vec::<UpdateStatus<_, UpdateMetaProgress, _>>::new();
|
|
|
|
for result in processed {
|
|
|
|
let (uid, meta) = result?;
|
|
|
|
updates.push(UpdateStatus::Processed { update_id: uid.get(), meta });
|
|
|
|
}
|
|
|
|
for result in aborted {
|
|
|
|
let (uid, meta) = result?;
|
|
|
|
updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta });
|
|
|
|
}
|
|
|
|
for result in pending {
|
|
|
|
let (uid, meta) = result?;
|
|
|
|
updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
|
|
|
|
}
|
|
|
|
Ok(updates)
|
|
|
|
})
|
|
|
|
.unwrap();
|
2020-10-19 19:57:15 +02:00
|
|
|
|
2020-11-29 12:23:52 +01:00
|
|
|
updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse());
|
2020-10-20 15:00:58 +02:00
|
|
|
|
2020-11-29 12:23:52 +01:00
|
|
|
if header.contains("text/html") {
|
2020-10-20 15:00:58 +02:00
|
|
|
// We retrieve the database size.
|
2021-06-16 18:33:33 +02:00
|
|
|
let db_size =
|
|
|
|
File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len()
|
|
|
|
as usize;
|
2020-10-20 15:00:58 +02:00
|
|
|
|
|
|
|
// And the number of documents in the database.
|
2020-10-30 11:42:00 +01:00
|
|
|
let rtxn = index_cloned.read_txn().unwrap();
|
2020-10-20 15:00:58 +02:00
|
|
|
let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
|
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let template =
|
|
|
|
UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates };
|
2020-10-19 19:57:15 +02:00
|
|
|
Box::new(template) as Box<dyn warp::Reply>
|
|
|
|
} else {
|
|
|
|
Box::new(warp::reply::json(&updates))
|
|
|
|
}
|
|
|
|
});
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_bulma_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "text/css; charset=utf-8")
|
|
|
|
.body(include_str!("../public/bulma.min.css"))
|
|
|
|
});
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_bulma_dark_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "text/css; charset=utf-8")
|
|
|
|
.body(include_str!("../public/bulma-prefers-dark.min.css"))
|
|
|
|
});
|
2020-07-13 23:51:41 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| {
|
|
|
|
Response::builder()
|
2020-07-11 11:48:27 +02:00
|
|
|
.header("content-type", "text/css; charset=utf-8")
|
2020-11-05 11:16:39 +01:00
|
|
|
.body(include_str!("../public/style.css"))
|
2021-06-16 18:33:33 +02:00
|
|
|
});
|
2020-07-11 11:48:27 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_jquery_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../public/jquery-3.4.1.min.js"))
|
|
|
|
});
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_filesize_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../public/filesize.min.js"))
|
|
|
|
});
|
2020-07-11 14:17:37 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| {
|
|
|
|
Response::builder()
|
2020-07-11 11:48:27 +02:00
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
2020-11-05 11:16:39 +01:00
|
|
|
.body(include_str!("../public/script.js"))
|
2021-06-16 18:33:33 +02:00
|
|
|
});
|
2020-07-11 11:48:27 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let updates_script_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "application/javascript; charset=utf-8")
|
|
|
|
.body(include_str!("../public/updates-script.js"))
|
|
|
|
});
|
2020-10-19 19:57:15 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_logo_white_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "image/svg+xml")
|
|
|
|
.body(include_str!("../public/logo-white.svg"))
|
|
|
|
});
|
2020-07-15 23:51:12 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let dash_logo_black_route =
|
|
|
|
warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| {
|
|
|
|
Response::builder()
|
|
|
|
.header("content-type", "image/svg+xml")
|
|
|
|
.body(include_str!("../public/logo-black.svg"))
|
|
|
|
});
|
2020-07-15 23:51:12 +02:00
|
|
|
|
2021-01-07 10:15:31 +01:00
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
#[serde(untagged)]
|
|
|
|
enum UntaggedEither<L, R> {
|
|
|
|
Left(L),
|
|
|
|
Right(R),
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<L, R> From<UntaggedEither<L, R>> for Either<L, R> {
|
|
|
|
fn from(value: UntaggedEither<L, R>) -> Either<L, R> {
|
|
|
|
match value {
|
|
|
|
UntaggedEither::Left(left) => Either::Left(left),
|
|
|
|
UntaggedEither::Right(right) => Either::Right(right),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-14 13:21:22 +01:00
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
#[serde(deny_unknown_fields)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
2020-05-31 17:48:13 +02:00
|
|
|
struct QueryBody {
|
2020-10-06 14:52:05 +02:00
|
|
|
query: Option<String>,
|
2021-01-07 10:15:31 +01:00
|
|
|
filters: Option<String>,
|
2021-09-09 12:20:08 +02:00
|
|
|
sort: Option<String>,
|
2021-01-07 10:15:31 +01:00
|
|
|
facet_filters: Option<Vec<UntaggedEither<Vec<String>, String>>>,
|
2020-12-28 19:08:53 +01:00
|
|
|
facet_distribution: Option<bool>,
|
2021-06-22 14:47:23 +02:00
|
|
|
limit: Option<usize>,
|
2020-12-28 19:08:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Serialize)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
|
|
|
struct Answer {
|
2022-06-15 15:36:27 +02:00
|
|
|
documents: Vec<Object>,
|
2020-12-29 00:50:06 +01:00
|
|
|
number_of_candidates: u64,
|
2021-05-04 12:09:43 +02:00
|
|
|
facets: BTreeMap<String, BTreeMap<String, u64>>,
|
2020-05-31 17:48:13 +02:00
|
|
|
}
|
|
|
|
|
2020-07-14 11:27:46 +02:00
|
|
|
let disable_highlighting = opt.disable_highlighting;
|
2020-11-10 17:00:38 +01:00
|
|
|
let index_cloned = index.clone();
|
2020-05-31 17:48:13 +02:00
|
|
|
let query_route = warp::filters::method::post()
|
|
|
|
.and(warp::path!("query"))
|
|
|
|
.and(warp::body::json())
|
|
|
|
.map(move |query: QueryBody| {
|
|
|
|
let before_search = Instant::now();
|
2020-11-10 17:00:38 +01:00
|
|
|
let index = index_cloned.clone();
|
2020-10-30 10:56:35 +01:00
|
|
|
let rtxn = index.read_txn().unwrap();
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2020-10-06 14:52:05 +02:00
|
|
|
let mut search = index.search(&rtxn);
|
|
|
|
if let Some(query) = query.query {
|
|
|
|
search.query(query);
|
|
|
|
}
|
2021-01-07 10:15:31 +01:00
|
|
|
|
2021-11-04 15:52:22 +01:00
|
|
|
let filters = match query.filters.as_ref() {
|
2021-01-07 10:15:31 +01:00
|
|
|
Some(condition) if !condition.trim().is_empty() => {
|
2021-12-09 11:50:12 +01:00
|
|
|
MilliFilter::from_str(condition).unwrap()
|
2021-04-07 14:33:44 +03:00
|
|
|
}
|
2021-01-07 10:15:31 +01:00
|
|
|
_otherwise => None,
|
|
|
|
};
|
|
|
|
|
2021-11-04 15:52:22 +01:00
|
|
|
let facet_filters = match query.facet_filters.as_ref() {
|
2021-01-07 10:15:31 +01:00
|
|
|
Some(array) => {
|
2021-11-04 15:52:22 +01:00
|
|
|
let eithers = array.iter().map(|either| match either {
|
|
|
|
UntaggedEither::Left(l) => {
|
|
|
|
Either::Left(l.iter().map(|s| s.as_str()).collect::<Vec<&str>>())
|
|
|
|
}
|
|
|
|
UntaggedEither::Right(r) => Either::Right(r.as_str()),
|
|
|
|
});
|
2021-10-22 17:23:22 +02:00
|
|
|
MilliFilter::from_array(eithers).unwrap()
|
2021-04-07 14:33:44 +03:00
|
|
|
}
|
2021-01-07 10:15:31 +01:00
|
|
|
_otherwise => None,
|
|
|
|
};
|
|
|
|
|
|
|
|
let condition = match (filters, facet_filters) {
|
2021-11-04 15:52:22 +01:00
|
|
|
(Some(filters), Some(facet_filters)) => Some(FilterCondition::And(
|
|
|
|
Box::new(filters.into()),
|
|
|
|
Box::new(facet_filters.into()),
|
|
|
|
)),
|
|
|
|
(Some(condition), None) | (None, Some(condition)) => Some(condition.into()),
|
2021-01-07 10:15:31 +01:00
|
|
|
_otherwise => None,
|
|
|
|
};
|
|
|
|
|
|
|
|
if let Some(condition) = condition {
|
2021-11-04 15:52:22 +01:00
|
|
|
search.filter(condition.into());
|
2020-11-14 13:21:22 +01:00
|
|
|
}
|
2020-10-06 14:52:05 +02:00
|
|
|
|
2021-06-22 14:47:23 +02:00
|
|
|
if let Some(limit) = query.limit {
|
|
|
|
search.limit(limit);
|
|
|
|
}
|
|
|
|
|
2021-09-09 12:20:08 +02:00
|
|
|
if let Some(sort) = query.sort {
|
2021-09-28 11:15:24 +02:00
|
|
|
search.sort_criteria(vec![sort.parse().map_err(SortError::from).unwrap()]);
|
2021-08-31 18:49:06 +02:00
|
|
|
}
|
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let SearchResult { matching_words, candidates, documents_ids } =
|
|
|
|
search.execute().unwrap();
|
2020-12-28 19:08:53 +01:00
|
|
|
|
2020-12-29 00:50:06 +01:00
|
|
|
let number_of_candidates = candidates.len();
|
2020-12-28 19:08:53 +01:00
|
|
|
let facets = if query.facet_distribution == Some(true) {
|
2021-01-27 14:15:33 +01:00
|
|
|
Some(index.facets_distribution(&rtxn).candidates(candidates).execute().unwrap())
|
2020-12-28 19:08:53 +01:00
|
|
|
} else {
|
|
|
|
None
|
|
|
|
};
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2020-10-21 18:26:29 +02:00
|
|
|
let mut documents = Vec::new();
|
2020-10-25 18:32:01 +01:00
|
|
|
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
2021-01-20 17:27:43 +01:00
|
|
|
let displayed_fields = match index.displayed_fields_ids(&rtxn).unwrap() {
|
|
|
|
Some(fields) => fields,
|
|
|
|
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
|
2020-11-02 13:01:32 +01:00
|
|
|
};
|
2020-11-05 13:58:07 +01:00
|
|
|
let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() {
|
2021-01-20 17:27:43 +01:00
|
|
|
Some(fields) => fields.into_iter().map(String::from).collect(),
|
|
|
|
None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(),
|
2020-11-05 13:58:07 +01:00
|
|
|
};
|
2020-10-22 14:23:33 +02:00
|
|
|
|
2022-06-02 18:15:36 +02:00
|
|
|
let mut matcher_builder =
|
|
|
|
MatcherBuilder::new(matching_words, TokenizerBuilder::default().build());
|
2022-03-30 10:50:23 +02:00
|
|
|
matcher_builder.highlight_prefix("<mark>".to_string());
|
|
|
|
matcher_builder.highlight_suffix("</mark>".to_string());
|
2022-06-02 18:15:36 +02:00
|
|
|
let highlighter = Highlighter::new(matcher_builder);
|
2020-11-05 13:58:07 +01:00
|
|
|
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
|
|
|
|
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
|
2020-10-22 14:23:33 +02:00
|
|
|
if !disable_highlighting {
|
2022-06-02 18:15:36 +02:00
|
|
|
highlighter.highlight_record(&mut object, &attributes_to_highlight);
|
2020-10-21 18:26:29 +02:00
|
|
|
}
|
2020-10-22 14:23:33 +02:00
|
|
|
|
2020-11-05 13:58:07 +01:00
|
|
|
documents.push(object);
|
2020-10-21 18:26:29 +02:00
|
|
|
}
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let answer =
|
|
|
|
Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() };
|
2020-12-28 19:08:53 +01:00
|
|
|
|
2020-05-31 17:48:13 +02:00
|
|
|
Response::builder()
|
2020-10-21 18:26:29 +02:00
|
|
|
.header("Content-Type", "application/json")
|
2020-05-31 17:48:13 +02:00
|
|
|
.header("Time-Ms", before_search.elapsed().as_millis().to_string())
|
2020-12-28 19:08:53 +01:00
|
|
|
.body(serde_json::to_string(&answer).unwrap())
|
2020-05-31 17:48:13 +02:00
|
|
|
});
|
|
|
|
|
2020-11-10 17:00:38 +01:00
|
|
|
let index_cloned = index.clone();
|
2021-06-16 18:33:33 +02:00
|
|
|
let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map(
|
|
|
|
move |id: String| {
|
2020-11-10 17:00:38 +01:00
|
|
|
let index = index_cloned.clone();
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
|
|
|
|
2020-11-22 11:54:04 +01:00
|
|
|
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
|
2020-11-10 17:00:38 +01:00
|
|
|
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
2021-01-20 17:27:43 +01:00
|
|
|
let displayed_fields = match index.displayed_fields_ids(&rtxn).unwrap() {
|
|
|
|
Some(fields) => fields,
|
|
|
|
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
|
2020-11-10 17:00:38 +01:00
|
|
|
};
|
|
|
|
|
2020-11-22 11:54:04 +01:00
|
|
|
match external_documents_ids.get(&id) {
|
2020-11-10 17:00:38 +01:00
|
|
|
Some(document_id) => {
|
|
|
|
let document_id = document_id as u32;
|
2021-06-16 18:33:33 +02:00
|
|
|
let (_, obkv) =
|
|
|
|
index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap();
|
2020-11-10 17:00:38 +01:00
|
|
|
let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
|
|
|
|
|
|
|
|
Response::builder()
|
|
|
|
.header("Content-Type", "application/json")
|
|
|
|
.body(serde_json::to_string(&document).unwrap())
|
2021-04-07 14:33:44 +03:00
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
None => Response::builder()
|
|
|
|
.status(404)
|
|
|
|
.body(format!("Document with id {:?} not found.", id)),
|
2020-11-10 17:00:38 +01:00
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
},
|
|
|
|
);
|
2020-11-10 17:00:38 +01:00
|
|
|
|
2020-10-19 16:03:17 +02:00
|
|
|
async fn buf_stream(
|
2020-10-21 15:38:28 +02:00
|
|
|
update_store: Arc<UpdateStore<UpdateMeta, String>>,
|
2021-06-16 18:33:33 +02:00
|
|
|
update_status_sender: broadcast::Sender<
|
|
|
|
UpdateStatus<UpdateMeta, UpdateMetaProgress, String>,
|
|
|
|
>,
|
2020-10-31 17:48:24 +01:00
|
|
|
update_method: Option<String>,
|
2021-08-31 11:44:15 +02:00
|
|
|
format: String,
|
2020-12-20 23:10:09 +01:00
|
|
|
encoding: Option<String>,
|
2021-06-16 18:33:33 +02:00
|
|
|
mut stream: impl futures::Stream<Item = Result<impl bytes::Buf, warp::Error>> + Unpin,
|
|
|
|
) -> Result<impl warp::Reply, warp::Rejection> {
|
2020-10-19 16:03:17 +02:00
|
|
|
let file = tokio::task::block_in_place(tempfile::tempfile).unwrap();
|
2020-12-20 23:10:09 +01:00
|
|
|
let mut file = TFile::from_std(file);
|
2020-10-19 16:03:17 +02:00
|
|
|
|
|
|
|
while let Some(result) = stream.next().await {
|
2022-03-14 17:13:07 +01:00
|
|
|
let mut bytes = Vec::new();
|
|
|
|
result.unwrap().reader().read_to_end(&mut bytes).unwrap();
|
2020-12-20 23:10:09 +01:00
|
|
|
file.write_all(&bytes[..]).await.unwrap();
|
2020-10-19 16:03:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
let file = file.into_std().await;
|
2021-10-10 22:47:12 +01:00
|
|
|
let mmap = unsafe { memmap2::Mmap::map(&file).expect("can't map file") };
|
2020-10-19 16:03:17 +02:00
|
|
|
|
2020-10-31 17:48:24 +01:00
|
|
|
let method = match update_method.as_deref() {
|
|
|
|
Some("replace") => String::from("replace"),
|
|
|
|
Some("update") => String::from("update"),
|
|
|
|
_ => String::from("replace"),
|
|
|
|
};
|
|
|
|
|
2020-12-20 23:10:09 +01:00
|
|
|
let meta = UpdateMeta::DocumentsAddition { method, format, encoding };
|
2020-10-20 12:09:38 +02:00
|
|
|
let update_id = update_store.register_update(&meta, &mmap[..]).unwrap();
|
2020-10-20 15:14:06 +02:00
|
|
|
let _ = update_status_sender.send(UpdateStatus::Pending { update_id, meta });
|
2020-10-20 12:09:38 +02:00
|
|
|
eprintln!("update {} registered", update_id);
|
2020-10-19 16:03:17 +02:00
|
|
|
|
|
|
|
Ok(warp::reply())
|
|
|
|
}
|
|
|
|
|
2020-10-31 17:48:24 +01:00
|
|
|
#[derive(Deserialize)]
|
|
|
|
struct QueryUpdate {
|
|
|
|
method: Option<String>,
|
|
|
|
}
|
|
|
|
|
2020-10-19 16:03:17 +02:00
|
|
|
let update_store_cloned = update_store.clone();
|
2020-10-20 11:19:34 +02:00
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
2020-12-20 23:10:09 +01:00
|
|
|
let indexing_route = warp::filters::method::post()
|
2020-10-31 17:48:24 +01:00
|
|
|
.and(warp::path!("documents"))
|
2020-12-20 23:10:09 +01:00
|
|
|
.and(warp::header::header("content-type"))
|
|
|
|
.and(warp::header::optional::<String>("content-encoding"))
|
|
|
|
.and(warp::query::query())
|
2020-10-19 16:03:17 +02:00
|
|
|
.and(warp::body::stream())
|
2020-12-20 23:10:09 +01:00
|
|
|
.and_then(move |content_type: String, content_encoding, params: QueryUpdate, stream| {
|
|
|
|
let format = match content_type.as_str() {
|
2021-08-31 11:44:15 +02:00
|
|
|
"text/csv" => "csv",
|
|
|
|
"application/json" => "json",
|
|
|
|
"application/x-ndjson" => "jsonl",
|
2020-12-20 23:10:09 +01:00
|
|
|
otherwise => panic!("invalid update format: {}", otherwise),
|
|
|
|
};
|
2020-10-19 16:03:17 +02:00
|
|
|
|
2020-11-01 11:50:10 +01:00
|
|
|
buf_stream(
|
|
|
|
update_store_cloned.clone(),
|
|
|
|
update_status_sender_cloned.clone(),
|
|
|
|
params.method,
|
2021-08-31 11:44:15 +02:00
|
|
|
format.to_string(),
|
2020-12-20 23:10:09 +01:00
|
|
|
content_encoding,
|
2020-11-01 11:50:10 +01:00
|
|
|
stream,
|
|
|
|
)
|
|
|
|
});
|
|
|
|
|
2020-11-02 15:30:29 +01:00
|
|
|
let update_store_cloned = update_store.clone();
|
2020-10-21 15:38:28 +02:00
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
2021-06-16 18:33:33 +02:00
|
|
|
let clearing_route =
|
|
|
|
warp::filters::method::post().and(warp::path!("clear-documents")).map(move || {
|
2020-10-30 13:12:55 +01:00
|
|
|
let meta = UpdateMeta::ClearDocuments;
|
2020-11-02 15:47:21 +01:00
|
|
|
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
|
|
|
eprintln!("update {} registered", update_id);
|
|
|
|
Ok(warp::reply())
|
|
|
|
});
|
|
|
|
|
|
|
|
let update_store_cloned = update_store.clone();
|
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
|
|
|
let change_settings_route = warp::filters::method::post()
|
|
|
|
.and(warp::path!("settings"))
|
|
|
|
.and(warp::body::json())
|
|
|
|
.map(move |settings: Settings| {
|
|
|
|
let meta = UpdateMeta::Settings(settings);
|
|
|
|
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
2020-10-21 15:38:28 +02:00
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
|
|
|
eprintln!("update {} registered", update_id);
|
|
|
|
Ok(warp::reply())
|
|
|
|
});
|
|
|
|
|
2020-11-17 21:19:25 +01:00
|
|
|
let update_store_cloned = update_store.clone();
|
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
|
|
|
let change_facet_levels_route = warp::filters::method::post()
|
2020-11-28 12:43:43 +01:00
|
|
|
.and(warp::path!("facet-level-sizes"))
|
2020-11-17 21:19:25 +01:00
|
|
|
.and(warp::body::json())
|
2020-11-23 13:08:57 +01:00
|
|
|
.map(move |levels: Facets| {
|
|
|
|
let meta = UpdateMeta::Facets(levels);
|
2020-11-17 21:19:25 +01:00
|
|
|
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
|
|
|
eprintln!("update {} registered", update_id);
|
|
|
|
warp::reply()
|
|
|
|
});
|
|
|
|
|
2020-11-29 12:23:52 +01:00
|
|
|
let update_store_cloned = update_store.clone();
|
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
|
|
|
let abort_update_id_route = warp::filters::method::delete()
|
|
|
|
.and(warp::path!("update" / u64))
|
|
|
|
.map(move |update_id: u64| {
|
|
|
|
if let Some(meta) = update_store_cloned.abort_update(update_id).unwrap() {
|
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta });
|
|
|
|
eprintln!("update {} aborted", update_id);
|
|
|
|
}
|
|
|
|
warp::reply()
|
|
|
|
});
|
|
|
|
|
|
|
|
let update_store_cloned = update_store.clone();
|
|
|
|
let update_status_sender_cloned = update_status_sender.clone();
|
2021-06-16 18:33:33 +02:00
|
|
|
let abort_pending_updates_route =
|
|
|
|
warp::filters::method::delete().and(warp::path!("updates")).map(move || {
|
2020-11-29 12:23:52 +01:00
|
|
|
let updates = update_store_cloned.abort_pendings().unwrap();
|
|
|
|
for (update_id, meta) in updates {
|
|
|
|
let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta });
|
|
|
|
eprintln!("update {} aborted", update_id);
|
|
|
|
}
|
|
|
|
warp::reply()
|
|
|
|
});
|
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
let update_ws_route =
|
|
|
|
warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| {
|
2020-10-19 16:03:17 +02:00
|
|
|
// And then our closure will be called when it completes...
|
2020-10-20 11:19:34 +02:00
|
|
|
let update_status_receiver = update_status_sender.subscribe();
|
2020-10-19 16:03:17 +02:00
|
|
|
ws.on_upgrade(|websocket| {
|
|
|
|
// Just echo all updates messages...
|
2022-03-14 17:13:07 +01:00
|
|
|
BroadcastStream::new(update_status_receiver)
|
2021-06-16 18:33:33 +02:00
|
|
|
.flat_map(|result| match result {
|
|
|
|
Ok(status) => {
|
|
|
|
let msg = serde_json::to_string(&status).unwrap();
|
|
|
|
stream::iter(Some(Ok(Message::text(msg))))
|
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
eprintln!("channel error: {:?}", e);
|
|
|
|
stream::iter(None)
|
2020-10-20 11:19:34 +02:00
|
|
|
}
|
|
|
|
})
|
2020-10-19 16:03:17 +02:00
|
|
|
.forward(websocket)
|
|
|
|
.map(|result| {
|
|
|
|
if let Err(e) = result {
|
|
|
|
eprintln!("websocket error: {:?}", e);
|
|
|
|
}
|
|
|
|
})
|
|
|
|
})
|
|
|
|
});
|
|
|
|
|
2021-07-05 17:31:41 +02:00
|
|
|
let die_route = warp::filters::method::get().and(warp::path!("die")).map(move || {
|
2021-07-05 17:43:28 +02:00
|
|
|
eprintln!("Killed by an HTTP request received on the die route");
|
2021-07-05 17:31:41 +02:00
|
|
|
std::process::exit(0);
|
2021-07-05 17:43:28 +02:00
|
|
|
#[allow(unreachable_code)]
|
2021-07-05 17:31:41 +02:00
|
|
|
warp::reply()
|
|
|
|
});
|
|
|
|
|
2020-05-31 17:48:13 +02:00
|
|
|
let routes = dash_html_route
|
2020-10-19 19:57:15 +02:00
|
|
|
.or(updates_list_or_html_route)
|
2020-05-31 17:48:13 +02:00
|
|
|
.or(dash_bulma_route)
|
2020-07-13 23:51:41 +02:00
|
|
|
.or(dash_bulma_dark_route)
|
2020-07-11 11:48:27 +02:00
|
|
|
.or(dash_style_route)
|
2020-05-31 17:48:13 +02:00
|
|
|
.or(dash_jquery_route)
|
2020-07-11 14:17:37 +02:00
|
|
|
.or(dash_filesize_route)
|
2020-07-11 11:48:27 +02:00
|
|
|
.or(dash_script_route)
|
2020-10-19 19:57:15 +02:00
|
|
|
.or(updates_script_route)
|
2020-07-15 23:51:12 +02:00
|
|
|
.or(dash_logo_white_route)
|
|
|
|
.or(dash_logo_black_route)
|
2020-10-19 16:03:17 +02:00
|
|
|
.or(query_route)
|
2020-11-10 17:00:38 +01:00
|
|
|
.or(document_route)
|
2020-12-20 23:10:09 +01:00
|
|
|
.or(indexing_route)
|
2020-11-29 12:23:52 +01:00
|
|
|
.or(abort_update_id_route)
|
|
|
|
.or(abort_pending_updates_route)
|
2020-10-30 13:12:55 +01:00
|
|
|
.or(clearing_route)
|
2020-11-02 15:30:29 +01:00
|
|
|
.or(change_settings_route)
|
2020-11-17 21:19:25 +01:00
|
|
|
.or(change_facet_levels_route)
|
2021-07-05 17:31:41 +02:00
|
|
|
.or(update_ws_route)
|
|
|
|
.or(die_route);
|
2020-05-31 17:48:13 +02:00
|
|
|
|
2020-10-19 13:44:17 +02:00
|
|
|
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
|
2021-04-21 00:27:23 +02:00
|
|
|
warp::serve(routes).run(addr).await;
|
|
|
|
Ok(())
|
2020-05-31 17:48:13 +02:00
|
|
|
}
|
2021-04-07 15:06:14 +03:00
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
|
|
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
|
|
let reader = BufReader::new(reader);
|
2021-08-31 11:44:15 +02:00
|
|
|
|
2022-06-15 15:36:27 +02:00
|
|
|
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
2022-06-14 17:21:54 +02:00
|
|
|
let object = result?;
|
|
|
|
documents.append_json_object(&object)?;
|
2021-08-31 11:44:15 +02:00
|
|
|
}
|
2022-02-02 17:55:13 +01:00
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
documents.into_inner().map_err(Into::into)
|
2021-08-31 11:44:15 +02:00
|
|
|
}
|
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
|
|
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
2021-08-31 11:44:15 +02:00
|
|
|
|
2022-07-11 18:38:50 +02:00
|
|
|
documents.append_json_array(reader)?;
|
2021-08-31 11:44:15 +02:00
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
documents.into_inner().map_err(Into::into)
|
2021-08-31 11:44:15 +02:00
|
|
|
}
|
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> {
|
|
|
|
let csv = csv::Reader::from_reader(reader);
|
|
|
|
|
|
|
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
|
|
documents.append_csv(csv)?;
|
2021-08-31 11:44:15 +02:00
|
|
|
|
2022-06-14 17:21:54 +02:00
|
|
|
documents.into_inner().map_err(Into::into)
|
2021-08-31 11:44:15 +02:00
|
|
|
}
|
|
|
|
|
2021-04-07 15:06:14 +03:00
|
|
|
#[cfg(test)]
mod tests {
    use maplit::{btreeset, hashmap, hashset};
    use milli::update::Setting;
    use serde_test::{assert_tokens, Token};

    use crate::Settings;

    /// A field holding `Setting::Set(v)` round-trips as `Some(v)`.
    #[test]
    fn serde_settings_set() {
        let all_set = Settings {
            displayed_attributes: Setting::Set(vec![String::from("name")]),
            searchable_attributes: Setting::Set(vec![String::from("age")]),
            filterable_attributes: Setting::Set(hashset! { String::from("age") }),
            sortable_attributes: Setting::Set(hashset! { String::from("age") }),
            criteria: Setting::Set(vec![String::from("age:asc")]),
            stop_words: Setting::Set(btreeset! { String::from("and") }),
            synonyms: Setting::Set(
                hashmap! { String::from("alex") => vec![String::from("alexey")] },
            ),
        };

        let expected = [
            Token::Struct { name: "Settings", len: 7 },
            // displayedAttributes
            Token::Str("displayedAttributes"),
            Token::Some,
            Token::Seq { len: Some(1) },
            Token::Str("name"),
            Token::SeqEnd,
            // searchableAttributes
            Token::Str("searchableAttributes"),
            Token::Some,
            Token::Seq { len: Some(1) },
            Token::Str("age"),
            Token::SeqEnd,
            // filterableAttributes
            Token::Str("filterableAttributes"),
            Token::Some,
            Token::Seq { len: Some(1) },
            Token::Str("age"),
            Token::SeqEnd,
            // sortableAttributes
            Token::Str("sortableAttributes"),
            Token::Some,
            Token::Seq { len: Some(1) },
            Token::Str("age"),
            Token::SeqEnd,
            // criteria
            Token::Str("criteria"),
            Token::Some,
            Token::Seq { len: Some(1) },
            Token::Str("age:asc"),
            Token::SeqEnd,
            // stopWords
            Token::Str("stopWords"),
            Token::Some,
            Token::Seq { len: Some(1) },
            Token::Str("and"),
            Token::SeqEnd,
            // synonyms
            Token::Str("synonyms"),
            Token::Some,
            Token::Map { len: Some(1) },
            Token::Str("alex"),
            Token::Seq { len: Some(1) },
            Token::Str("alexey"),
            Token::SeqEnd,
            Token::MapEnd,
            Token::StructEnd,
        ];

        assert_tokens(&all_set, &expected);
    }

    /// A field holding `Setting::Reset` round-trips as an explicit `None`.
    #[test]
    fn serde_settings_reset() {
        let all_reset = Settings {
            displayed_attributes: Setting::Reset,
            searchable_attributes: Setting::Reset,
            filterable_attributes: Setting::Reset,
            sortable_attributes: Setting::Reset,
            criteria: Setting::Reset,
            stop_words: Setting::Reset,
            synonyms: Setting::Reset,
        };

        let expected = [
            Token::Struct { name: "Settings", len: 7 },
            Token::Str("displayedAttributes"),
            Token::None,
            Token::Str("searchableAttributes"),
            Token::None,
            Token::Str("filterableAttributes"),
            Token::None,
            Token::Str("sortableAttributes"),
            Token::None,
            Token::Str("criteria"),
            Token::None,
            Token::Str("stopWords"),
            Token::None,
            Token::Str("synonyms"),
            Token::None,
            Token::StructEnd,
        ];

        assert_tokens(&all_reset, &expected);
    }

    /// A field holding `Setting::NotSet` is left out entirely, so the struct has no entries.
    #[test]
    fn serde_settings_notset() {
        let nothing_set = Settings {
            displayed_attributes: Setting::NotSet,
            searchable_attributes: Setting::NotSet,
            filterable_attributes: Setting::NotSet,
            sortable_attributes: Setting::NotSet,
            criteria: Setting::NotSet,
            stop_words: Setting::NotSet,
            synonyms: Setting::NotSet,
        };

        let expected = [Token::Struct { name: "Settings", len: 0 }, Token::StructEnd];

        assert_tokens(&nothing_set, &expected);
    }
}
|