From 68d783145b382837575729860d511d67b74fa78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 1 Nov 2020 17:52:04 +0100 Subject: [PATCH 1/7] Introduce searchable fields methods on the index --- src/index.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index d025ac5aa..1a0c5d4d6 100644 --- a/src/index.rs +++ b/src/index.rs @@ -18,6 +18,7 @@ pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; +pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USERS_IDS_DOCUMENTS_IDS_KEY: &str = "users-ids-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; @@ -94,7 +95,7 @@ impl Index { self.main.put::<_, Str, OwnedType>(wtxn, PRIMARY_KEY_KEY, &primary_key) } - /// Delete the primary key of the documents, this can be done to reset indexes settings. + /// Deletes the primary key of the documents, this can be done to reset indexes settings. pub fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, PRIMARY_KEY_KEY) } @@ -155,6 +156,25 @@ impl Index { self.main.get::<_, Str, ByteSlice>(rtxn, DISPLAYED_FIELDS_KEY) } + /* searchable fields */ + + /// Writes the searchable fields, when this list is specified, only these are indexed. + pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[u8]) -> heed::Result<()> { + assert!(fields.windows(2).all(|win| win[0] < win[1])); // is sorted + self.main.put::<_, Str, ByteSlice>(wtxn, SEARCHABLE_FIELDS_KEY, fields) + } + + /// Deletes the searchable fields, when no fields are specified, all fields are indexed. + pub fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, SEARCHABLE_FIELDS_KEY) + } + + /// Returns the searchable fields ids, those are the fields that are indexed, + /// if the searchable fields aren't there it means that **all** the fields are indexed. + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { + self.main.get::<_, Str, ByteSlice>(rtxn, SEARCHABLE_FIELDS_KEY) + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. From e48630da72ab947a617c131e873bbbfdf877ccef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 13:20:11 +0100 Subject: [PATCH 2/7] Introduce the searchable parameter settings to the Settings update --- src/subcommand/serve.rs | 12 +- src/update/index_documents/mod.rs | 29 +++-- src/update/index_documents/transform.rs | 53 ++++++++ src/update/settings.rs | 156 ++++++++++++++++++++++-- src/update/update_builder.rs | 15 ++- 5 files changed, 243 insertions(+), 22 deletions(-) diff --git a/src/subcommand/serve.rs b/src/subcommand/serve.rs index 1b2ee48de..57cc2e012 100644 --- a/src/subcommand/serve.rs +++ b/src/subcommand/serve.rs @@ -308,7 +308,17 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { } } - match builder.execute() { + let result = builder.execute(|count, total| { + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + processed_number_of_documents: count, + total_number_of_documents: Some(total), + } + }); + }); + + match result { Ok(_count) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 1606ecd32..7dd3b6611 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -9,8 +9,10 @@ use bstr::ByteSlice as _; use grenad::{Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; use heed::types::ByteSlice; use log::{debug, info, error}; +use memmap::Mmap; use rayon::prelude::*; use rayon::ThreadPool; + use crate::index::Index; use self::store::Store; use self::merge_function::{ @@ -248,7 +250,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { R: io::Read, F: Fn(usize, usize) + Sync, { - let before_indexing = Instant::now(); + let before_transform = Instant::now(); let transform = Transform { rtxn: &self.wtxn, @@ -268,6 +270,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { UpdateFormat::JsonStream => transform.from_json_stream(reader)?, }; + info!("Update transformed in {:.02?}", before_transform.elapsed()); + + self.execute_raw(output, progress_callback) + } + + pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()> + where + F: Fn(usize, usize) + Sync + { + let before_indexing = Instant::now(); + let TransformOutput { primary_key, fields_ids_map, @@ -296,16 +309,14 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let _deleted_documents_count = deletion_builder.execute()?; } - let mmap = if documents_count == 0 { - None + let mmap; + let bytes = if documents_count == 0 { + &[][..] } else { - let mmap = unsafe { - memmap::Mmap::map(&documents_file).context("mmaping the transform documents file")? - }; - Some(mmap) + mmap = unsafe { Mmap::map(&documents_file).context("mmaping the transform documents file")? }; + &mmap }; - let bytes = mmap.as_ref().map(AsRef::as_ref).unwrap_or_default(); let documents = grenad::Reader::new(bytes).unwrap(); // The enum which indicates the type of the readers @@ -492,7 +503,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } } - info!("Update processed in {:.02?}", before_indexing.elapsed()); + info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); Ok(()) } diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index ff71928cb..cb583849a 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -404,6 +404,59 @@ impl Transform<'_, '_> { documents_file, }) } + + /// Returns a `TransformOutput` with a file that contains the documents of the index + /// with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + // TODO this can be done in parallel by using the rayon `ThreadPool`. + pub fn remap_index_documents( + self, + primary_key: u8, + fields_ids_map: FieldsIdsMap, + ) -> anyhow::Result + { + let current_fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn)?; + let documents_ids = self.index.documents_ids(self.rtxn)?; + let documents_count = documents_ids.len() as usize; + + // We create a final writer to write the new documents in order from the sorter. + let file = tempfile::tempfile()?; + let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; + + let mut obkv_buffer = Vec::new(); + for result in self.index.documents.iter(self.rtxn)? { + let (docid, obkv) = result?; + let docid = docid.get(); + + obkv_buffer.clear(); + let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer); + + // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. + for (id, name) in fields_ids_map.iter() { + if let Some(val) = current_fields_ids_map.id(name).and_then(|id| obkv.get(id)) { + obkv_writer.insert(id, val)?; + } + } + + let buffer = obkv_writer.into_inner()?; + writer.insert(docid.to_be_bytes(), buffer)?; + } + + // Once we have written all the documents, we extract + // the file and reset the seek to be able to read it again. + let mut documents_file = writer.into_inner()?; + documents_file.seek(SeekFrom::Start(0))?; + + Ok(TransformOutput { + primary_key, + fields_ids_map, + users_ids_documents_ids: users_ids_documents_ids.map_data(Cow::into_owned)?, + new_documents_ids: documents_ids, + replaced_documents_ids: RoaringBitmap::default(), + documents_count, + documents_file, + }) + } } /// Only the last value associated with an id is kept. diff --git a/src/update/settings.rs b/src/update/settings.rs index 0bfc7ff6d..b9abeb477 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -1,17 +1,45 @@ use anyhow::Context; -use crate::Index; +use grenad::CompressionType; +use rayon::ThreadPool; -pub struct Settings<'t, 'u, 'i> { +use crate::update::index_documents::{Transform, IndexDocumentsMethod}; +use crate::update::{ClearDocuments, IndexDocuments}; +use crate::{Index, FieldsIdsMap}; + +pub struct Settings<'a, 't, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - // If the field is set to `None` it means that it hasn't been set by the user, + pub(crate) log_every_n: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + pub(crate) linked_hash_map_size: Option, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) thread_pool: Option<&'a ThreadPool>, + + // If a struct field is set to `None` it means that it hasn't been set by the user, // however if it is `Some(None)` it means that the user forced a reset of the setting. + searchable_fields: Option>>, displayed_fields: Option>>, } -impl<'t, 'u, 'i> Settings<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Settings<'t, 'u, 'i> { - Settings { wtxn, index, displayed_fields: None } +impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Settings<'a, 't, 'u, 'i> { + Settings { + wtxn, + index, + log_every_n: None, + max_nb_chunks: None, + max_memory: None, + linked_hash_map_size: None, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + thread_pool: None, + searchable_fields: None, + displayed_fields: None, + } } pub fn reset_displayed_fields(&mut self) { @@ -22,8 +50,116 @@ impl<'t, 'u, 'i> Settings<'t, 'u, 'i> { self.displayed_fields = Some(Some(names)); } - pub fn execute(self) -> anyhow::Result<()> { - // Check that the displayed attributes parameters has been specified. + pub fn execute(self, progress_callback: F) -> anyhow::Result<()> + where + F: Fn(usize, usize) + Sync + { + // Check that the searchable attributes have been specified. + if let Some(value) = self.searchable_fields { + let current_searchable_fields = self.index.searchable_fields(self.wtxn)?; + let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; + let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + + let result = match value { + Some(fields_names) => { + // We create or generate the fields ids corresponding to those names. + let mut fields_ids_map = FieldsIdsMap::new(); + let mut searchable_fields = Vec::new(); + for name in fields_names { + let id = fields_ids_map.insert(&name).context("field id limit reached")?; + searchable_fields.push(id); + } + + // We complete the new FieldsIdsMap with the previous names. + for (_id, name) in current_fields_ids_map.iter() { + fields_ids_map.insert(name).context("field id limit reached")?; + } + + // We must also update the displayed fields according to the new `FieldsIdsMap`. + let displayed_fields = match current_displayed_fields { + Some(fields) => { + let mut displayed_fields = Vec::new(); + for id in fields { + let name = current_fields_ids_map.name(*id).unwrap(); + let id = fields_ids_map.id(name).context("field id limit reached")?; + displayed_fields.push(id); + } + Some(displayed_fields) + }, + None => None, + }; + + (fields_ids_map, Some(searchable_fields), displayed_fields) + }, + None => ( + current_fields_ids_map.clone(), + current_searchable_fields.map(ToOwned::to_owned), + current_displayed_fields.map(ToOwned::to_owned), + ), + }; + + let (mut fields_ids_map, searchable_fields, displayed_fields) = result; + + let transform = Transform { + rtxn: &self.wtxn, + index: self.index, + chunk_compression_type: self.chunk_compression_type, + chunk_compression_level: self.chunk_compression_level, + chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, + max_nb_chunks: self.max_nb_chunks, + max_memory: self.max_memory, + index_documents_method: IndexDocumentsMethod::ReplaceDocuments, + autogenerate_docids: false, + }; + + // We compute or generate the new primary key field id. + let primary_key = match self.index.primary_key(&self.wtxn)? { + Some(id) => { + let name = current_fields_ids_map.name(id).unwrap(); + fields_ids_map.insert(name).context("field id limit reached")? + }, + None => fields_ids_map.insert("id").context("field id limit reached")?, + }; + + // We remap the documents fields based on the new `FieldsIdsMap`. + let output = transform.remap_index_documents(primary_key, fields_ids_map.clone())?; + + // We write the new FieldsIdsMap to the database + // this way next indexing methods will be based on that. + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + + // The new searchable fields are also written down to make sure + // that the IndexDocuments system takes only these ones into account. + match searchable_fields { + Some(fields) => self.index.put_searchable_fields(self.wtxn, &fields)?, + None => self.index.delete_searchable_fields(self.wtxn).map(drop)?, + } + + // We write the displayed fields into the database here + // to make sure that the right fields are displayed. + match displayed_fields { + Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, + None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, + } + + // We clear the full database (words-fst, documents ids and documents content). + ClearDocuments::new(self.wtxn, self.index).execute()?; + + // We index the generated `TransformOutput` which must contain + // all the documents with fields in the newly defined searchable order. + let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index); + indexing_builder.log_every_n = self.log_every_n; + indexing_builder.max_nb_chunks = self.max_nb_chunks; + indexing_builder.max_memory = self.max_memory; + indexing_builder.linked_hash_map_size = self.linked_hash_map_size; + indexing_builder.chunk_compression_type = self.chunk_compression_type; + indexing_builder.chunk_compression_level = self.chunk_compression_level; + indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + indexing_builder.thread_pool = self.thread_pool; + indexing_builder.execute_raw(output, progress_callback)?; + } + + // Check that the displayed attributes have been specified. if let Some(value) = self.displayed_fields { match value { // If it has been set, and it was a list of fields names, we create @@ -99,7 +235,7 @@ mod tests { // In the same transaction we change the displayed fields to be only the age. let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["age".into()]); - builder.execute().unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to only the "age" field. @@ -114,7 +250,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index); builder.reset_displayed_fields(); - builder.execute().unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). diff --git a/src/update/update_builder.rs b/src/update/update_builder.rs index a4a70be91..67ea04bfc 100644 --- a/src/update/update_builder.rs +++ b/src/update/update_builder.rs @@ -103,8 +103,19 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Settings<'t, 'u, 'i> + ) -> Settings<'a, 't, 'u, 'i> { - Settings::new(wtxn, index) + let mut builder = Settings::new(wtxn, index); + + builder.log_every_n = self.log_every_n; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.linked_hash_map_size = self.linked_hash_map_size; + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.thread_pool = self.thread_pool; + + builder } } From 649fb6e40145b3415a1222c0b7d4dbc212327956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 13:42:29 +0100 Subject: [PATCH 3/7] Make sure that the indexing Store only index searchable fields --- src/update/index_documents/mod.rs | 7 +++++ src/update/index_documents/store.rs | 48 +++++++++++++++++------------ src/update/settings.rs | 11 +++++-- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 7dd3b6611..82582dac8 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::sync::mpsc::sync_channel; @@ -327,6 +328,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { WordsPairsProximitiesDocids, } + let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { + Some(fields) => fields.iter().copied().collect(), + None => fields_ids_map.iter().map(|(id, _name)| id).collect(), + }; + let linked_hash_map_size = self.linked_hash_map_size; let max_nb_chunks = self.max_nb_chunks; let max_memory = self.max_memory; @@ -354,6 +360,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .enumerate() .map(|(i, documents)| { let store = Store::new( + searchable_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 7c1896ee5..43e37d9cd 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::convert::{TryFrom, TryInto}; use std::fs::File; use std::iter::FromIterator; @@ -37,6 +37,9 @@ pub struct Readers { } pub struct Store { + // Indexing parameters + searchable_fields: HashSet, + // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, @@ -56,6 +59,7 @@ pub struct Store { impl Store { pub fn new( + searchable_fields: HashSet, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -101,18 +105,22 @@ impl Store { })?; Ok(Store { + // Indexing parameters. + searchable_fields, + // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids_limit: linked_hash_map_size, words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids_limit: linked_hash_map_size, + // MTBL parameters chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, - + // MTBL sorters main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, - + // MTBL writers docid_word_positions_writer, documents_writer, }) @@ -309,23 +317,25 @@ impl Store { } for (attr, content) in document.iter() { - use serde_json::Value; - let content: Cow = match serde_json::from_slice(content) { - Ok(string) => string, - Err(_) => match serde_json::from_slice(content)? { - Value::Null => continue, - Value::Bool(boolean) => Cow::Owned(boolean.to_string()), - Value::Number(number) => Cow::Owned(number.to_string()), - Value::String(string) => Cow::Owned(string), - Value::Array(_array) => continue, - Value::Object(_object) => continue, - } - }; + if self.searchable_fields.contains(&attr) { + use serde_json::Value; + let content: Cow = match serde_json::from_slice(content) { + Ok(string) => string, + Err(_) => match serde_json::from_slice(content)? { + Value::Null => continue, + Value::Bool(boolean) => Cow::Owned(boolean.to_string()), + Value::Number(number) => Cow::Owned(number.to_string()), + Value::String(string) => Cow::Owned(string), + Value::Array(_array) => continue, + Value::Object(_object) => continue, + } + }; - for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { + let word = token.to_lowercase(); + let position = (attr as usize * MAX_POSITION + pos) as u32; + words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + } } } diff --git a/src/update/settings.rs b/src/update/settings.rs index b9abeb477..f9265d976 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -42,6 +42,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_searchable_fields(&mut self) { + self.searchable_fields = Some(None); + } + + pub fn set_searchable_fields(&mut self, names: Vec) { + self.searchable_fields = Some(Some(names)); + } + pub fn reset_displayed_fields(&mut self) { self.displayed_fields = Some(None); } @@ -56,7 +64,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { { // Check that the searchable attributes have been specified. if let Some(value) = self.searchable_fields { - let current_searchable_fields = self.index.searchable_fields(self.wtxn)?; let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -93,7 +100,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }, None => ( current_fields_ids_map.clone(), - current_searchable_fields.map(ToOwned::to_owned), + None, current_displayed_fields.map(ToOwned::to_owned), ), }; From a20c871ece0a9a930e42046391e4fcfb945ad91e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 18:22:40 +0100 Subject: [PATCH 4/7] Add more tests to the Settings searchable attributes operation --- src/update/settings.rs | 103 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/update/settings.rs b/src/update/settings.rs index f9265d976..96ff44857 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -203,6 +203,109 @@ mod tests { use crate::update::{IndexDocuments, UpdateFormat}; use heed::EnvOpenOptions; + #[test] + fn set_and_reset_searchable_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with ids from 1 to 3. + let mut wtxn = index.write_txn().unwrap(); + let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // We change the searchable fields to be the "name" field only. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec!["name".into()]); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that the searchable field is correctly set to "name" only. + let rtxn = index.read_txn().unwrap(); + // When we search for something that is not in + // the searchable fields it must not return any document. + let result = index.search(&rtxn).query("23").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + + // When we search for something that is in the searchable fields + // we must find the appropriate document. + let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); + let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + assert_eq!(documents.len(), 1); + assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + drop(rtxn); + + // We change the searchable fields to be the "name" field only. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.reset_searchable_fields(); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that the searchable field have been reset and documents are found now. + let rtxn = index.read_txn().unwrap(); + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + assert_eq!(searchable_fields, None); + let result = index.search(&rtxn).query("23").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + drop(rtxn); + } + + #[test] + fn mixup_searchable_with_displayed_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with ids from 1 to 3. + let mut wtxn = index.write_txn().unwrap(); + let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // In the same transaction we change the displayed fields to be only the "age". + // We also change the searchable fields to be the "name" field only. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_displayed_fields(vec!["age".into()]); + builder.set_searchable_fields(vec!["name".into()]); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + let age_id = fields_ids_map.id("age").unwrap(); + assert_eq!(fields_ids, Some(&[age_id][..])); + drop(rtxn); + + // We change the searchable fields to be the "name" field only. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.reset_searchable_fields(); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields always contains only the "age" field. + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + let age_id = fields_ids_map.id("age").unwrap(); + assert_eq!(fields_ids, Some(&[age_id][..])); + drop(rtxn); + } + #[test] fn default_displayed_fields() { let path = tempfile::tempdir().unwrap(); From 63f65bac3ec67b704237d2dce23941f278ee6d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 19:12:00 +0100 Subject: [PATCH 5/7] Ignore the long running UpdateStore test --- src/update/update_store.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/update/update_store.rs b/src/update/update_store.rs index 448a0ed95..4723f0b68 100644 --- a/src/update/update_store.rs +++ b/src/update/update_store.rs @@ -220,6 +220,7 @@ mod tests { } #[test] + #[ignore] fn long_running_update() { let dir = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); From 01c4f5abcd3ebb28d4a504e770d78de5230e9ba4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 19:35:55 +0100 Subject: [PATCH 6/7] Introduce the searchable attributes setting to the settings route --- src/subcommand/serve.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/subcommand/serve.rs b/src/subcommand/serve.rs index 57cc2e012..dd10607f2 100644 --- a/src/subcommand/serve.rs +++ b/src/subcommand/serve.rs @@ -184,6 +184,13 @@ struct Settings { skip_serializing_if = "Option::is_none", )] displayed_attributes: Option>>, + + #[serde( + default, + deserialize_with = "deserialize_some", + skip_serializing_if = "Option::is_none", + )] + searchable_attributes: Option>>, } // Any value that is present is considered Some value, including null. @@ -300,6 +307,14 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { let mut wtxn = index_cloned.write_txn()?; let mut builder = update_builder.settings(&mut wtxn, &index_cloned); + // We transpose the settings JSON struct into a real setting update. + if let Some(names) = settings.searchable_attributes { + match names { + Some(names) => builder.set_searchable_fields(names), + None => builder.reset_searchable_fields(), + } + } + // We transpose the settings JSON struct into a real setting update. if let Some(names) = settings.displayed_attributes { match names { From a31db33e93a51347a3f17fcc8e14a39a646d83b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 19:59:09 +0100 Subject: [PATCH 7/7] Introduce an optimization when the searchable attributes are ordered --- src/update/settings.rs | 70 ++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/src/update/settings.rs b/src/update/settings.rs index 96ff44857..427f0e926 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -69,34 +69,50 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let result = match value { Some(fields_names) => { - // We create or generate the fields ids corresponding to those names. - let mut fields_ids_map = FieldsIdsMap::new(); - let mut searchable_fields = Vec::new(); - for name in fields_names { - let id = fields_ids_map.insert(&name).context("field id limit reached")?; - searchable_fields.push(id); + let mut fields_ids_map = current_fields_ids_map.clone(); + let searchable_fields: Vec<_> = + fields_names.iter() + .map(|name| fields_ids_map.insert(name)) + .collect::>>() + .context("field id limit reached")?; + + // If the searchable fields are ordered we don't have to generate a new `FieldsIdsMap`. + if searchable_fields.windows(2).all(|win| win[0] < win[1]) { + ( + fields_ids_map, + Some(searchable_fields), + current_displayed_fields.map(ToOwned::to_owned), + ) + } else { + // We create or generate the fields ids corresponding to those names. + let mut fields_ids_map = FieldsIdsMap::new(); + let mut searchable_fields = Vec::new(); + for name in fields_names { + let id = fields_ids_map.insert(&name).context("field id limit reached")?; + searchable_fields.push(id); + } + + // We complete the new FieldsIdsMap with the previous names. + for (_id, name) in current_fields_ids_map.iter() { + fields_ids_map.insert(name).context("field id limit reached")?; + } + + // We must also update the displayed fields according to the new `FieldsIdsMap`. + let displayed_fields = match current_displayed_fields { + Some(fields) => { + let mut displayed_fields = Vec::new(); + for id in fields { + let name = current_fields_ids_map.name(*id).unwrap(); + let id = fields_ids_map.id(name).context("field id limit reached")?; + displayed_fields.push(id); + } + Some(displayed_fields) + }, + None => None, + }; + + (fields_ids_map, Some(searchable_fields), displayed_fields) } - - // We complete the new FieldsIdsMap with the previous names. - for (_id, name) in current_fields_ids_map.iter() { - fields_ids_map.insert(name).context("field id limit reached")?; - } - - // We must also update the displayed fields according to the new `FieldsIdsMap`. - let displayed_fields = match current_displayed_fields { - Some(fields) => { - let mut displayed_fields = Vec::new(); - for id in fields { - let name = current_fields_ids_map.name(*id).unwrap(); - let id = fields_ids_map.id(name).context("field id limit reached")?; - displayed_fields.push(id); - } - Some(displayed_fields) - }, - None => None, - }; - - (fields_ids_map, Some(searchable_fields), displayed_fields) }, None => ( current_fields_ids_map.clone(),