Make sure that the indexing Store only indexes searchable fields

This commit is contained in:
Clément Renault 2020-11-03 13:42:29 +01:00
parent e48630da72
commit 649fb6e401
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 45 additions and 21 deletions

View File

@ -1,4 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File; use std::fs::File;
use std::io::{self, Seek, SeekFrom}; use std::io::{self, Seek, SeekFrom};
use std::sync::mpsc::sync_channel; use std::sync::mpsc::sync_channel;
@ -327,6 +328,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
WordsPairsProximitiesDocids, WordsPairsProximitiesDocids,
} }
let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? {
Some(fields) => fields.iter().copied().collect(),
None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
};
let linked_hash_map_size = self.linked_hash_map_size; let linked_hash_map_size = self.linked_hash_map_size;
let max_nb_chunks = self.max_nb_chunks; let max_nb_chunks = self.max_nb_chunks;
let max_memory = self.max_memory; let max_memory = self.max_memory;
@ -354,6 +360,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
.enumerate() .enumerate()
.map(|(i, documents)| { .map(|(i, documents)| {
let store = Store::new( let store = Store::new(
searchable_fields.clone(),
linked_hash_map_size, linked_hash_map_size,
max_nb_chunks, max_nb_chunks,
max_memory_by_job, max_memory_by_job,

View File

@ -1,5 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap}; use std::collections::{BTreeMap, HashMap, HashSet};
use std::convert::{TryFrom, TryInto}; use std::convert::{TryFrom, TryInto};
use std::fs::File; use std::fs::File;
use std::iter::FromIterator; use std::iter::FromIterator;
@ -37,6 +37,9 @@ pub struct Readers {
} }
pub struct Store { pub struct Store {
// Indexing parameters
searchable_fields: HashSet<u8>,
// Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize, word_docids_limit: usize,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
@ -56,6 +59,7 @@ pub struct Store {
impl Store { impl Store {
pub fn new( pub fn new(
searchable_fields: HashSet<u8>,
linked_hash_map_size: Option<usize>, linked_hash_map_size: Option<usize>,
max_nb_chunks: Option<usize>, max_nb_chunks: Option<usize>,
max_memory: Option<usize>, max_memory: Option<usize>,
@ -101,18 +105,22 @@ impl Store {
})?; })?;
Ok(Store { Ok(Store {
// Indexing parameters.
searchable_fields,
// Caches
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
word_docids_limit: linked_hash_map_size, word_docids_limit: linked_hash_map_size,
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
words_pairs_proximities_docids_limit: linked_hash_map_size, words_pairs_proximities_docids_limit: linked_hash_map_size,
// MTBL parameters
chunk_compression_type, chunk_compression_type,
chunk_compression_level, chunk_compression_level,
chunk_fusing_shrink_size, chunk_fusing_shrink_size,
// MTBL sorters
main_sorter, main_sorter,
word_docids_sorter, word_docids_sorter,
words_pairs_proximities_docids_sorter, words_pairs_proximities_docids_sorter,
// MTBL writers
docid_word_positions_writer, docid_word_positions_writer,
documents_writer, documents_writer,
}) })
@ -309,23 +317,25 @@ impl Store {
} }
for (attr, content) in document.iter() { for (attr, content) in document.iter() {
use serde_json::Value; if self.searchable_fields.contains(&attr) {
let content: Cow<str> = match serde_json::from_slice(content) { use serde_json::Value;
Ok(string) => string, let content: Cow<str> = match serde_json::from_slice(content) {
Err(_) => match serde_json::from_slice(content)? { Ok(string) => string,
Value::Null => continue, Err(_) => match serde_json::from_slice(content)? {
Value::Bool(boolean) => Cow::Owned(boolean.to_string()), Value::Null => continue,
Value::Number(number) => Cow::Owned(number.to_string()), Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
Value::String(string) => Cow::Owned(string), Value::Number(number) => Cow::Owned(number.to_string()),
Value::Array(_array) => continue, Value::String(string) => Cow::Owned(string),
Value::Object(_object) => continue, Value::Array(_array) => continue,
} Value::Object(_object) => continue,
}; }
};
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
let word = token.to_lowercase(); let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32; let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
}
} }
} }

View File

@ -42,6 +42,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} }
} }
/// Requests that the searchable fields be reset.
///
/// Stores `Some(None)` in `self.searchable_fields`: the outer `Some` marks
/// the setting as specified, and the inner `None` carries no explicit list
/// of field names.
// NOTE(review): presumably nothing is written to the index until the
// settings update is executed — confirm against the execute path.
pub fn reset_searchable_fields(&mut self) {
self.searchable_fields = Some(None);
}
/// Requests that the searchable fields be set to `names`.
///
/// Takes ownership of `names` and stores it as `Some(Some(names))` in
/// `self.searchable_fields`: the outer `Some` marks the setting as
/// specified, and the inner `Some` holds the requested field names.
// NOTE(review): presumably the names are resolved to field ids when the
// settings update is executed — confirm against the execute path.
pub fn set_searchable_fields(&mut self, names: Vec<String>) {
self.searchable_fields = Some(Some(names));
}
pub fn reset_displayed_fields(&mut self) { pub fn reset_displayed_fields(&mut self) {
self.displayed_fields = Some(None); self.displayed_fields = Some(None);
} }
@ -56,7 +64,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
{ {
// Check that the searchable attributes have been specified. // Check that the searchable attributes have been specified.
if let Some(value) = self.searchable_fields { if let Some(value) = self.searchable_fields {
let current_searchable_fields = self.index.searchable_fields(self.wtxn)?;
let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; let current_displayed_fields = self.index.displayed_fields(self.wtxn)?;
let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
@ -93,7 +100,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}, },
None => ( None => (
current_fields_ids_map.clone(), current_fields_ids_map.clone(),
current_searchable_fields.map(ToOwned::to_owned), None,
current_displayed_fields.map(ToOwned::to_owned), current_displayed_fields.map(ToOwned::to_owned),
), ),
}; };