Make sure that the indexing Store only indexes searchable fields

This commit is contained in:
Clément Renault 2020-11-03 13:42:29 +01:00
parent e48630da72
commit 649fb6e401
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 45 additions and 21 deletions

View File

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::sync::mpsc::sync_channel;
@ -327,6 +328,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
WordsPairsProximitiesDocids,
}
let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? {
Some(fields) => fields.iter().copied().collect(),
None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
};
let linked_hash_map_size = self.linked_hash_map_size;
let max_nb_chunks = self.max_nb_chunks;
let max_memory = self.max_memory;
@ -354,6 +360,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
.enumerate()
.map(|(i, documents)| {
let store = Store::new(
searchable_fields.clone(),
linked_hash_map_size,
max_nb_chunks,
max_memory_by_job,

View File

@ -1,5 +1,5 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::convert::{TryFrom, TryInto};
use std::fs::File;
use std::iter::FromIterator;
@ -37,6 +37,9 @@ pub struct Readers {
}
pub struct Store {
// Indexing parameters
searchable_fields: HashSet<u8>,
// Caches
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
@ -56,6 +59,7 @@ pub struct Store {
impl Store {
pub fn new(
searchable_fields: HashSet<u8>,
linked_hash_map_size: Option<usize>,
max_nb_chunks: Option<usize>,
max_memory: Option<usize>,
@ -101,18 +105,22 @@ impl Store {
})?;
Ok(Store {
// Indexing parameters.
searchable_fields,
// Caches
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
word_docids_limit: linked_hash_map_size,
words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
words_pairs_proximities_docids_limit: linked_hash_map_size,
// MTBL parameters
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
// MTBL sorters
main_sorter,
word_docids_sorter,
words_pairs_proximities_docids_sorter,
// MTBL writers
docid_word_positions_writer,
documents_writer,
})
@ -309,23 +317,25 @@ impl Store {
}
for (attr, content) in document.iter() {
use serde_json::Value;
let content: Cow<str> = match serde_json::from_slice(content) {
Ok(string) => string,
Err(_) => match serde_json::from_slice(content)? {
Value::Null => continue,
Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
Value::Number(number) => Cow::Owned(number.to_string()),
Value::String(string) => Cow::Owned(string),
Value::Array(_array) => continue,
Value::Object(_object) => continue,
}
};
if self.searchable_fields.contains(&attr) {
use serde_json::Value;
let content: Cow<str> = match serde_json::from_slice(content) {
Ok(string) => string,
Err(_) => match serde_json::from_slice(content)? {
Value::Null => continue,
Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
Value::Number(number) => Cow::Owned(number.to_string()),
Value::String(string) => Cow::Owned(string),
Value::Array(_array) => continue,
Value::Object(_object) => continue,
}
};
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
}
}
}

View File

@ -42,6 +42,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
/// Records a request to reset the searchable fields.
///
/// The field is written as `Some(None)`: the outer `Some` marks the setting
/// as specified by the caller, while the inner `None` carries no explicit
/// list of field names.
// NOTE(review): presumably the inner `None` makes every field searchable
// again once the settings are applied — confirm against the apply step.
pub fn reset_searchable_fields(&mut self) {
    let no_explicit_list = None;
    self.searchable_fields = Some(no_explicit_list);
}
/// Records a request to use `names` as the searchable fields.
///
/// The field is written as `Some(Some(names))`: the outer `Some` marks the
/// setting as specified by the caller, and the inner `Some` carries the
/// requested field names, taken by value.
pub fn set_searchable_fields(&mut self, names: Vec<String>) {
    let explicit_list = Some(names);
    self.searchable_fields = Some(explicit_list);
}
/// Records a request to reset the displayed fields.
///
/// The field is written as `Some(None)`: an outer `Some` wrapping an inner
/// `None`, i.e. no explicit list of field names.
// NOTE(review): presumably the outer `Some` marks the setting as specified
// and the inner `None` means "reset", mirroring how the searchable fields
// are handled — confirm against the apply step.
pub fn reset_displayed_fields(&mut self) {
    let no_explicit_list = None;
    self.displayed_fields = Some(no_explicit_list);
}
@ -56,7 +64,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
{
// Check that the searchable attributes have been specified.
if let Some(value) = self.searchable_fields {
let current_searchable_fields = self.index.searchable_fields(self.wtxn)?;
let current_displayed_fields = self.index.displayed_fields(self.wtxn)?;
let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
@ -93,7 +100,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
},
None => (
current_fields_ids_map.clone(),
current_searchable_fields.map(ToOwned::to_owned),
None,
current_displayed_fields.map(ToOwned::to_owned),
),
};