Fix settings bug

replace ids with str in settings

This allows for better maintainability of the settings code, since
updating the searchable attributes is now straightforward.

criterion use string

fix reindexing fieldid remapping

add tests for primary_key compute

fix tests

fix http-ui

fixup! add tests for primary_key compute

code improvements settings

update deps

fixup! code improvements settings

fixup! refactor settings updates and fix bug

fixup! Fix settings bug

fixup! Fix settings bug

fixup! Fix settings bug

Update src/update/index_documents/transform.rs

Co-authored-by: Clément Renault <clement@meilisearch.com>

fixup! Fix settings bug
This commit is contained in:
mpostma 2021-01-20 17:27:43 +01:00
parent 26f060f66b
commit 87a56d2bc9
No known key found for this signature in database
GPG Key ID: CBC8A7C1D7A28C3A
15 changed files with 1028 additions and 878 deletions

481
Cargo.lock generated

File diff suppressed because it is too large Load Diff

632
http-ui/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::fmt::Display; use std::fmt::Display;
use std::fs::{File, create_dir_all}; use std::fs::{File, create_dir_all};
@ -654,13 +653,13 @@ async fn main() -> anyhow::Result<()> {
let mut documents = Vec::new(); let mut documents = Vec::new();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let displayed_fields = match index.displayed_fields(&rtxn).unwrap() { let displayed_fields = match index.displayed_fields_ids(&rtxn).unwrap() {
Some(fields) => Cow::Borrowed(fields), Some(fields) => fields,
None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), None => fields_ids_map.iter().map(|(id, _)| id).collect(),
}; };
let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() { let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() {
Some(fields) => fields.iter().flat_map(|id| fields_ids_map.name(*id)).map(ToOwned::to_owned).collect(), Some(fields) => fields.into_iter().map(String::from).collect(),
None => fields_ids_map.iter().map(|(_, name)| name).map(ToOwned::to_owned).collect(), None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(),
}; };
let stop_words = fst::Set::default(); let stop_words = fst::Set::default();
@ -690,9 +689,9 @@ async fn main() -> anyhow::Result<()> {
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let displayed_fields = match index.displayed_fields(&rtxn).unwrap() { let displayed_fields = match index.displayed_fields_ids(&rtxn).unwrap() {
Some(fields) => Cow::Borrowed(fields), Some(fields) => fields,
None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), None => fields_ids_map.iter().map(|(id, _)| id).collect(),
}; };
match external_documents_ids.get(&id) { match external_documents_ids.get(&id) {

View File

@ -1,10 +1,12 @@
use crate::{FieldsIdsMap, FieldId}; use std::collections::HashMap;
use anyhow::{Context, bail}; use anyhow::{Context, bail};
use regex::Regex; use regex::Regex;
use serde::{Serialize, Deserialize}; use serde::{Serialize, Deserialize};
#[derive(Debug, Serialize, Deserialize, Copy, Clone, PartialEq, Eq)] use crate::facet::FacetType;
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub enum Criterion { pub enum Criterion {
/// Sorted by increasing number of typos. /// Sorted by increasing number of typos.
Typo, Typo,
@ -21,13 +23,13 @@ pub enum Criterion {
/// Sorted by the similarity of the matched words with the query words. /// Sorted by the similarity of the matched words with the query words.
Exactness, Exactness,
/// Sorted by the increasing value of the field specified. /// Sorted by the increasing value of the field specified.
Asc(FieldId), Asc(String),
/// Sorted by the decreasing value of the field specified. /// Sorted by the decreasing value of the field specified.
Desc(FieldId), Desc(String),
} }
impl Criterion { impl Criterion {
pub fn from_str(fields_ids_map: &mut FieldsIdsMap, txt: &str) -> anyhow::Result<Criterion> { pub fn from_str(faceted_attributes: &HashMap<String, FacetType>, txt: &str) -> anyhow::Result<Criterion> {
match txt { match txt {
"typo" => Ok(Criterion::Typo), "typo" => Ok(Criterion::Typo),
"words" => Ok(Criterion::Words), "words" => Ok(Criterion::Words),
@ -40,22 +42,15 @@ impl Criterion {
let caps = re.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; let caps = re.captures(text).with_context(|| format!("unknown criterion name: {}", text))?;
let order = caps.get(1).unwrap().as_str(); let order = caps.get(1).unwrap().as_str();
let field_name = caps.get(2).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str();
let field_id = fields_ids_map.insert(field_name).context("field id limit reached")?; faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?;
match order { match order {
"asc" => Ok(Criterion::Asc(field_id)), "asc" => Ok(Criterion::Asc(field_name.to_string())),
"desc" => Ok(Criterion::Desc(field_id)), "desc" => Ok(Criterion::Desc(field_name.to_string())),
otherwise => bail!("unknown criterion name: {}", otherwise), otherwise => bail!("unknown criterion name: {}", otherwise),
} }
}, },
} }
} }
pub fn field_id(&self) -> Option<FieldId> {
match *self {
Criterion::Asc(fid) | Criterion::Desc(fid) => Some(fid),
_ => None,
}
}
} }
pub fn default_criteria() -> Vec<Criterion> { pub fn default_criteria() -> Vec<Criterion> {

View File

@ -112,8 +112,8 @@ impl Index {
/* primary key */ /* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id. /// Writes the documents primary key, this is the field name that is used to store the id.
pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: FieldId) -> heed::Result<()> { pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> {
self.main.put::<_, Str, OwnedType<FieldId>>(wtxn, PRIMARY_KEY_KEY, &primary_key) self.main.put::<_, Str, Str>(wtxn, PRIMARY_KEY_KEY, &primary_key)
} }
/// Deletes the primary key of the documents, this can be done to reset indexes settings. /// Deletes the primary key of the documents, this can be done to reset indexes settings.
@ -122,8 +122,8 @@ impl Index {
} }
/// Returns the documents primary key, `None` if it hasn't been defined. /// Returns the documents primary key, `None` if it hasn't been defined.
pub fn primary_key(&self, rtxn: &RoTxn) -> heed::Result<Option<FieldId>> { pub fn primary_key<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<&'t str>> {
self.main.get::<_, Str, OwnedType<FieldId>>(rtxn, PRIMARY_KEY_KEY) self.main.get::<_, Str, Str>(rtxn, PRIMARY_KEY_KEY)
} }
/* external documents ids */ /* external documents ids */
@ -175,10 +175,10 @@ impl Index {
/* displayed fields */ /* displayed fields */
/// Writes the fields ids that must be displayed in the defined order. /// Writes the fields that must be displayed in the defined order.
/// There must not be any duplicate field id. /// There must not be any duplicate field id.
pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[FieldId]) -> heed::Result<()> { pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, DISPLAYED_FIELDS_KEY, fields) self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, DISPLAYED_FIELDS_KEY, &fields)
} }
/// Deletes the displayed fields ids, this will make the engine to display /// Deletes the displayed fields ids, this will make the engine to display
@ -187,18 +187,27 @@ impl Index {
self.main.delete::<_, Str>(wtxn, DISPLAYED_FIELDS_KEY) self.main.delete::<_, Str>(wtxn, DISPLAYED_FIELDS_KEY)
} }
/// Returns the displayed fields ids in the order they must be returned. If it returns /// Returns the displayed fields in the order they were set by the user. If it returns
/// `None` it means that all the attributes are displayed in the order of the `FieldsIdsMap`. /// `None` it means that all the attributes are set as displayed in the order of the `FieldsIdsMap`.
pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<&'t [FieldId]>> { pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<Vec<&'t str>>> {
self.main.get::<_, Str, ByteSlice>(rtxn, DISPLAYED_FIELDS_KEY) self.main.get::<_, Str, SerdeBincode<Vec<&'t str>>>(rtxn, DISPLAYED_FIELDS_KEY)
}
/// Identical to `displayed_fields`, but returns the fields ids instead of the names.
///
/// Each name returned by `displayed_fields` is looked up in the `FieldsIdsMap` read
/// from the same transaction, and the ids are collected in the same order as the names.
/// Returns `Ok(None)` when `displayed_fields` returns `None`.
///
/// # Panics
///
/// Panics with "Field not found" if a displayed field name has no id in the
/// fields ids map.
pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<Option<Vec<FieldId>>> {
let fields_ids_map = self.fields_ids_map(rtxn)?;
let ids = self.displayed_fields(rtxn)?
.map(|fields| fields
.into_iter()
// A name without an id is not reported as an error: the lookup panics instead.
.map(|name| fields_ids_map.id(name).expect("Field not found"))
.collect::<Vec<_>>());
Ok(ids)
} }
/* searchable fields */ /* searchable fields */
/// Writes the searchable fields, when this list is specified, only these are indexed. /// Writes the searchable fields, when this list is specified, only these are indexed.
pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[FieldId]) -> heed::Result<()> { pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
assert!(fields.windows(2).all(|win| win[0] < win[1])); // is sorted self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, SEARCHABLE_FIELDS_KEY, &fields)
self.main.put::<_, Str, ByteSlice>(wtxn, SEARCHABLE_FIELDS_KEY, fields)
} }
/// Deletes the searchable fields, when no fields are specified, all fields are indexed. /// Deletes the searchable fields, when no fields are specified, all fields are indexed.
@ -206,17 +215,36 @@ impl Index {
self.main.delete::<_, Str>(wtxn, SEARCHABLE_FIELDS_KEY) self.main.delete::<_, Str>(wtxn, SEARCHABLE_FIELDS_KEY)
} }
/// Returns the searchable fields ids, those are the fields that are indexed, /// Returns the searchable fields, those are the fields that are indexed,
/// if the searchable fields aren't there it means that **all** the fields are indexed. /// if the searchable fields aren't there it means that **all** the fields are indexed.
pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<&'t [FieldId]>> { pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<Vec<&'t str>>> {
self.main.get::<_, Str, ByteSlice>(rtxn, SEARCHABLE_FIELDS_KEY) self.main.get::<_, Str, SerdeBincode<Vec<&'t str>>>(rtxn, SEARCHABLE_FIELDS_KEY)
}
/// Identical to `searchable_fields`, but returns the ids instead.
///
/// Each name returned by `searchable_fields` is looked up in the `FieldsIdsMap` read
/// from the same transaction; the ids are pushed in the same order as the names.
/// Returns `Ok(None)` when `searchable_fields` returns `None`.
///
/// # Panics
///
/// Panics (message prefixed with "corrupted data: ") if a searchable field name
/// has no id in the fields ids map.
pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<Option<Vec<FieldId>>> {
match self.searchable_fields(rtxn)? {
Some(names) => {
// The fields ids map is only read when there are names to resolve.
let fields_map = self.fields_ids_map(rtxn)?;
let mut ids = Vec::new();
for name in names {
// The missing id is first turned into an error message naming the field,
// then `expect` panics with that message: it is not returned to the caller.
let id = fields_map
.id(name)
.ok_or_else(|| format!("field id map must contain {:?}", name))
.expect("corrupted data: ");
ids.push(id);
}
Ok(Some(ids))
}
None => Ok(None),
}
} }
/* faceted fields */ /* faceted fields */
/// Writes the facet fields ids associated with their facet type or `None` if /// Writes the facet fields associated with their facet type or `None` if
/// the facet type is currently unknown. /// the facet type is currently unknown.
pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<FieldId, FacetType>) -> heed::Result<()> { pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<String, FacetType>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types)
} }
@ -225,9 +253,26 @@ impl Index {
self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY)
} }
/// Returns the facet fields ids associated with their facet type. /// Returns the facet fields names associated with their facet type.
pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result<HashMap<FieldId, FacetType>> { pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, FacetType>> {
Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
}
/// Same as `faceted_fields`, but returns ids instead.
///
/// Every field name key of the map returned by `faceted_fields` is replaced by the id
/// found for it in the `FieldsIdsMap` read from the same transaction; the associated
/// facet type is copied unchanged.
///
/// # Panics
///
/// Panics (message prefixed with "corrupted data: ") if a faceted field name
/// has no id in the fields ids map.
pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashMap<FieldId, FacetType>> {
let faceted_fields = self.faceted_fields(rtxn)?;
let fields_ids_map = self.fields_ids_map(rtxn)?;
// Shadows the name-keyed map with the id-keyed one built from it.
let faceted_fields = faceted_fields
.iter()
.map(|(k, v)| {
// The missing id is first turned into an error message naming the field,
// then `expect` panics with that message: it is not returned to the caller.
let kid = fields_ids_map
.id(k)
.ok_or_else(|| format!("{:?} should be present in the field id map", k))
.expect("corrupted data: ");
(kid, *v)
})
.collect();
Ok(faceted_fields)
} }
/* faceted documents ids */ /* faceted documents ids */

View File

@ -148,7 +148,7 @@ impl FacetCondition {
) -> anyhow::Result<FacetCondition> ) -> anyhow::Result<FacetCondition>
{ {
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?; let faceted_fields = index.faceted_fields_ids(rtxn)?;
let lexed = FilterParser::parse(Rule::prgm, expression)?; let lexed = FilterParser::parse(Rule::prgm, expression)?;
FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed)
} }
@ -552,15 +552,15 @@ mod tests {
// Test that the facet condition is correctly generated. // Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap();
let expected = OperatorString(1, FacetStringOperator::equal("Ponce")); let expected = OperatorString(0, FacetStringOperator::equal("Ponce"));
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
let expected = OperatorString(1, FacetStringOperator::not_equal("ponce")); let expected = OperatorString(0, FacetStringOperator::not_equal("ponce"));
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
let expected = OperatorString(1, FacetStringOperator::not_equal("ponce")); let expected = OperatorString(0, FacetStringOperator::not_equal("ponce"));
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }
@ -581,13 +581,13 @@ mod tests {
// Test that the facet condition is correctly generated. // Test that the facet condition is correctly generated.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
let expected = OperatorI64(1, Between(22, 44)); let expected = OperatorI64(0, Between(22, 44));
assert_eq!(condition, expected); assert_eq!(condition, expected);
let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
let expected = Or( let expected = Or(
Box::new(OperatorI64(1, LowerThan(22))), Box::new(OperatorI64(0, LowerThan(22))),
Box::new(OperatorI64(1, GreaterThan(44))), Box::new(OperatorI64(0, GreaterThan(44))),
); );
assert_eq!(condition, expected); assert_eq!(condition, expected);
} }

View File

@ -285,9 +285,13 @@ impl<'a> Search<'a> {
} }
}).next(); }).next();
match result { match result {
Some((fid, is_ascending)) => { Some((attr_name, is_ascending)) => {
let faceted_fields = self.index.faceted_fields(self.rtxn)?; let field_id_map = self.index.fields_ids_map(self.rtxn)?;
let ftype = *faceted_fields.get(&fid).context("unknown field id")?; let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?;
let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?;
let ftype = *faceted_fields.get(&fid)
.with_context(|| format!("{:?} not found in the faceted fields.", attr_name))
.expect("corrupted data: ");
Some((fid, ftype, is_ascending)) Some((fid, ftype, is_ascending))
}, },
None => None, None => None,

View File

@ -342,7 +342,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); } if heap.len() > limit { heap.pop(); }
} }
let faceted_fields = index.faceted_fields(rtxn)?; let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
for (field_id, field_type) in faceted_fields { for (field_id, field_type) in faceted_fields {
let facet_name = fields_ids_map.name(field_id).unwrap(); let facet_name = fields_ids_map.name(field_id).unwrap();
@ -413,7 +413,7 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<Strin
fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> {
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name) let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?; .with_context(|| format!("field {} not found", field_name))?;
@ -451,7 +451,7 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let faceted_fields = index.faceted_fields(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;
let field_id = fields_ids_map.id(&field_name) let field_id = fields_ids_map.id(&field_name)
.with_context(|| format!("field {} not found", field_name))?; .with_context(|| format!("field {} not found", field_name))?;

View File

@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{self, BufRead, Write}; use std::io::{self, BufRead, Write};
use std::iter::once; use std::iter::once;
use std::path::PathBuf; use std::path::PathBuf;
@ -47,9 +46,9 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
let index = Index::new(options, &opt.database)?; let index = Index::new(options, &opt.database)?;
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let displayed_fields = match index.displayed_fields(&rtxn)? { let displayed_fields = match index.displayed_fields_ids(&rtxn)? {
Some(fields) => Cow::Borrowed(fields), Some(fields) => fields,
None => Cow::Owned(fields_ids_map.iter().map(|(id, _)| id).collect()), None => fields_ids_map.iter().map(|(id, _)| id).collect(),
}; };
let stdin = io::stdin(); let stdin = io::stdin();

View File

@ -25,7 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
// We retrieve the number of documents ids that we are deleting. // We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?; let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
// We clean some of the main engine datastructures. // We clean some of the main engine datastructures.
self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_fst(self.wtxn, &fst::Set::default())?;

View File

@ -188,7 +188,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter); drop(iter);
// Remove the documents ids from the faceted documents ids. // Remove the documents ids from the faceted documents ids.
let faceted_fields = self.index.faceted_fields(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
for (field_id, facet_type) in faceted_fields { for (field_id, facet_type) in faceted_fields {
let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?;
docids.difference_with(&self.documents_ids); docids.difference_with(&self.documents_ids);

View File

@ -51,7 +51,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
pub fn execute(self) -> anyhow::Result<()> { pub fn execute(self) -> anyhow::Result<()> {
// We get the faceted fields to be able to create the facet levels. // We get the faceted fields to be able to create the facet levels.
let faceted_fields = self.index.faceted_fields(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
debug!("Computing and writing the facet values levels docids into LMDB on disk..."); debug!("Computing and writing the facet values levels docids into LMDB on disk...");
for (field_id, facet_type) in faceted_fields { for (field_id, facet_type) in faceted_fields {

View File

@ -338,8 +338,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
FacetLevel0ValuesDocids, FacetLevel0ValuesDocids,
} }
let faceted_fields = self.index.faceted_fields(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? {
Some(fields) => fields.iter().copied().collect(), Some(fields) => fields.iter().copied().collect(),
None => fields_ids_map.iter().map(|(id, _name)| id).collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
}; };
@ -485,7 +485,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
// We write the primary key field id into the main database // We write the primary key field id into the main database
self.index.put_primary_key(self.wtxn, primary_key)?; self.index.put_primary_key(self.wtxn, &primary_key)?;
// We write the external documents ids into the main database. // We write the external documents ids into the main database.
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;

View File

@ -10,13 +10,15 @@ use log::info;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use crate::{BEU32, MergeFn, Index, FieldId, FieldsIdsMap, ExternalDocumentsIds}; use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use super::merge_function::merge_two_obkvs; use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod}; use super::{create_writer, create_sorter, IndexDocumentsMethod};
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
pub struct TransformOutput { pub struct TransformOutput {
pub primary_key: FieldId, pub primary_key: String,
pub fields_ids_map: FieldsIdsMap, pub fields_ids_map: FieldsIdsMap,
pub external_documents_ids: ExternalDocumentsIds<'static>, pub external_documents_ids: ExternalDocumentsIds<'static>,
pub new_documents_ids: RoaringBitmap, pub new_documents_ids: RoaringBitmap,
@ -73,7 +75,6 @@ impl Transform<'_, '_> {
{ {
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let primary_key = self.index.primary_key(self.rtxn)?;
// Deserialize the whole batch of documents in memory. // Deserialize the whole batch of documents in memory.
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream { let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream {
@ -88,27 +89,15 @@ impl Transform<'_, '_> {
}; };
// We extract the primary key from the first document in // We extract the primary key from the first document in
// the batch if it hasn't already been defined in the index. // the batch if it hasn't already been defined in the index
let primary_key = match primary_key {
Some(primary_key) => primary_key,
None => {
// We ignore a potential error here as we can't early return it now,
// the peek method gives us only a reference on the next item,
// we will eventually return it in the iteration just after.
let first = documents.peek().and_then(|r| r.as_ref().ok()); let first = documents.peek().and_then(|r| r.as_ref().ok());
match first.and_then(|doc| doc.keys().find(|k| k.contains("id"))) { let alternative_name = first.and_then(|doc| doc.keys().find(|k| k.contains(DEFAULT_PRIMARY_KEY_NAME)).cloned());
Some(key) => fields_ids_map.insert(&key).context("field id limit reached")?, let (primary_key_id, primary_key) = compute_primary_key_pair(
None => { self.index.primary_key(self.rtxn)?,
if !self.autogenerate_docids { &mut fields_ids_map,
// If there is no primary key in the current document batch, we must alternative_name,
// return an error and not automatically generate any document id. self.autogenerate_docids
return Err(anyhow!("missing primary key")) )?;
}
fields_ids_map.insert("id").context("field id limit reached")?
},
}
},
};
if documents.peek().is_none() { if documents.peek().is_none() {
return Ok(TransformOutput { return Ok(TransformOutput {
@ -122,13 +111,6 @@ impl Transform<'_, '_> {
}); });
} }
// Get the primary key field name now, this way we will
// be able to get the value in the JSON Map document.
let primary_key_name = fields_ids_map
.name(primary_key)
.expect("found the primary key name")
.to_owned();
// We must choose the appropriate merge function for when two or more documents // We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch. // with the same user id must be merged or fully replaced in the same batch.
let merge_function = match self.index_documents_method { let merge_function = match self.index_documents_method {
@ -170,7 +152,7 @@ impl Transform<'_, '_> {
// We retrieve the user id from the document based on the primary key name, // We retrieve the user id from the document based on the primary key name,
// if the document id isn't present we generate a uuid. // if the document id isn't present we generate a uuid.
let external_id = match document.get(&primary_key_name) { let external_id = match document.get(&primary_key) {
Some(value) => match value { Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()), Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()), Value::Number(number) => Cow::Owned(number.to_string()),
@ -196,7 +178,7 @@ impl Transform<'_, '_> {
serde_json::to_writer(&mut json_buffer, value)?; serde_json::to_writer(&mut json_buffer, value)?;
writer.insert(field_id, &json_buffer)?; writer.insert(field_id, &json_buffer)?;
} }
else if field_id == primary_key { else if field_id == primary_key_id {
// We validate the document id [a-zA-Z0-9\-_]. // We validate the document id [a-zA-Z0-9\-_].
let external_id = match validate_document_id(&external_id) { let external_id = match validate_document_id(&external_id) {
Some(valid) => valid, Some(valid) => valid,
@ -240,42 +222,37 @@ impl Transform<'_, '_> {
let mut csv = csv::Reader::from_reader(reader); let mut csv = csv::Reader::from_reader(reader);
let headers = csv.headers()?; let headers = csv.headers()?;
let primary_key = self.index.primary_key(self.rtxn)?;
// Generate the new fields ids based on the current fields ids and this CSV headers.
let mut fields_ids = Vec::new(); let mut fields_ids = Vec::new();
// Generate the new fields ids based on the current fields ids and this CSV headers.
for (i, header) in headers.iter().enumerate() { for (i, header) in headers.iter().enumerate() {
let id = fields_ids_map.insert(header).context("field id limit reached)")?; let id = fields_ids_map.insert(header).context("field id limit reached)")?;
fields_ids.push((id, i)); fields_ids.push((id, i));
} }
// Extract the position of the primary key in the current headers, None if not found. // Extract the position of the primary key in the current headers, None if not found.
let external_id_pos = match primary_key { let primary_key_pos = match self.index.primary_key(self.rtxn)? {
Some(primary_key) => { Some(primary_key) => {
// Te primary key have is known so we must find the position in the CSV headers. // The primary key is known so we must find the position in the CSV headers.
let name = fields_ids_map.name(primary_key).expect("found the primary key name"); headers.iter().position(|h| h == primary_key)
headers.iter().position(|h| h == name)
}, },
None => headers.iter().position(|h| h.contains("id")), None => headers.iter().position(|h| h.contains("id")),
}; };
// Returns the field id in the fileds ids map, create an "id" field // Returns the field id in the fields ids map, create an "id" field
// in case it is not in the current headers. // in case it is not in the current headers.
let primary_key_field_id = match external_id_pos { let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string());
Some(pos) => fields_ids_map.id(&headers[pos]).expect("found the primary key"), let (primary_key_id, _) = compute_primary_key_pair(
None => { self.index.primary_key(self.rtxn)?,
if !self.autogenerate_docids { &mut fields_ids_map,
// If there is no primary key in the current document batch, we must alternative_name,
// return an error and not automatically generate any document id. self.autogenerate_docids
return Err(anyhow!("missing primary key")) )?;
// The primary key field is not present in the header, so we need to create it.
if primary_key_pos.is_none() {
fields_ids.push((primary_key_id, usize::max_value()));
} }
let field_id = fields_ids_map.insert("id").context("field id limit reached")?;
// We make sure to add the primary key field id to the fields ids,
// this way it is added to the obks.
fields_ids.push((field_id, usize::max_value()));
field_id
},
};
// We sort the fields ids by the fields ids map id, this way we are sure to iterate over // We sort the fields ids by the fields ids map id, this way we are sure to iterate over
// the records fields in the fields ids map order and correctly generate the obkv. // the records fields in the fields ids map order and correctly generate the obkv.
@ -310,7 +287,7 @@ impl Transform<'_, '_> {
} }
// We extract the user id if we know where it is or generate an UUID V4 otherwise. // We extract the user id if we know where it is or generate an UUID V4 otherwise.
let external_id = match external_id_pos { let external_id = match primary_key_pos {
Some(pos) => { Some(pos) => {
let external_id = &record[pos]; let external_id = &record[pos];
// We validate the document id [a-zA-Z0-9\-_]. // We validate the document id [a-zA-Z0-9\-_].
@ -326,7 +303,7 @@ impl Transform<'_, '_> {
// we return the generated document id instead of the record field. // we return the generated document id instead of the record field.
let iter = fields_ids.iter() let iter = fields_ids.iter()
.map(|(fi, i)| { .map(|(fi, i)| {
let field = if *fi == primary_key_field_id { external_id } else { &record[*i] }; let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field) (fi, field)
}); });
@ -349,9 +326,13 @@ impl Transform<'_, '_> {
// Now that we have a valid sorter that contains the user id and the obkv we // Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput. // give it to the last transforming function which returns the TransformOutput.
let primary_key_name = fields_ids_map
.name(primary_key_id)
.map(String::from)
.expect("Primary key must be present in fields id map");
self.output_from_sorter( self.output_from_sorter(
sorter, sorter,
primary_key_field_id, primary_key_name,
fields_ids_map, fields_ids_map,
documents_count, documents_count,
external_documents_ids, external_documents_ids,
@ -365,7 +346,7 @@ impl Transform<'_, '_> {
fn output_from_sorter<F>( fn output_from_sorter<F>(
self, self,
sorter: grenad::Sorter<MergeFn>, sorter: grenad::Sorter<MergeFn>,
primary_key: FieldId, primary_key: String,
fields_ids_map: FieldsIdsMap, fields_ids_map: FieldsIdsMap,
approximate_number_of_documents: usize, approximate_number_of_documents: usize,
mut external_documents_ids: ExternalDocumentsIds<'_>, mut external_documents_ids: ExternalDocumentsIds<'_>,
@ -477,11 +458,11 @@ impl Transform<'_, '_> {
// TODO this can be done in parallel by using the rayon `ThreadPool`. // TODO this can be done in parallel by using the rayon `ThreadPool`.
pub fn remap_index_documents( pub fn remap_index_documents(
self, self,
primary_key: FieldId, primary_key: String,
fields_ids_map: FieldsIdsMap, old_fields_ids_map: FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap,
) -> anyhow::Result<TransformOutput> ) -> anyhow::Result<TransformOutput>
{ {
let current_fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?;
let documents_count = documents_ids.len() as usize; let documents_count = documents_ids.len() as usize;
@ -499,8 +480,8 @@ impl Transform<'_, '_> {
let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer); let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer);
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
for (id, name) in fields_ids_map.iter() { for (id, name) in new_fields_ids_map.iter() {
if let Some(val) = current_fields_ids_map.id(name).and_then(|id| obkv.get(id)) { if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) {
obkv_writer.insert(id, val)?; obkv_writer.insert(id, val)?;
} }
} }
@ -516,7 +497,7 @@ impl Transform<'_, '_> {
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map: new_fields_ids_map,
external_documents_ids: external_documents_ids.into_static(), external_documents_ids: external_documents_ids.into_static(),
new_documents_ids: documents_ids, new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(), replaced_documents_ids: RoaringBitmap::default(),
@ -526,6 +507,42 @@ impl Transform<'_, '_> {
} }
} }
/// Resolves the primary key of a document batch as a `(field_id, attribute_name)` pair.
///
/// The candidates are tried in this order:
/// 1. `primary_key`, when it is `Some`: its id is looked up in `fields_ids_map`;
/// 2. `alternative_name`, when it is `Some`: the name is inserted in `fields_ids_map`;
/// 3. `DEFAULT_PRIMARY_KEY_NAME`, only when `autogenerate_docids` is `true`: the name
///    is inserted in `fields_ids_map`.
///
/// # Errors
///
/// Fails with "missing primary key" when no candidate is available and
/// `autogenerate_docids` is `false`, or with "field id limit reached" when the
/// chosen name cannot be inserted in `fields_ids_map`.
///
/// # Panics
///
/// Panics when `primary_key` is `Some` but is not registered in `fields_ids_map`.
fn compute_primary_key_pair(
    primary_key: Option<&str>,
    fields_ids_map: &mut FieldsIdsMap,
    alternative_name: Option<String>,
    autogenerate_docids: bool,
) -> anyhow::Result<(FieldId, String)> {
    // A primary key that is already known takes precedence over everything else,
    // and the map is left untouched in that case.
    if let Some(key) = primary_key {
        let field_id = fields_ids_map
            .id(key)
            .expect("primary key must be present in the fields id map");
        return Ok((field_id, key.to_string()));
    }

    let attr_name = match (alternative_name, autogenerate_docids) {
        (Some(candidate), _) => candidate,
        (None, true) => DEFAULT_PRIMARY_KEY_NAME.to_string(),
        // If there is no primary key in the current document batch, we must
        // return an error and not automatically generate any document id.
        (None, false) => anyhow::bail!("missing primary key"),
    };

    let field_id = fields_ids_map.insert(&attr_name).context("field id limit reached")?;
    Ok((field_id, attr_name))
}
/// Only the last value associated with an id is kept. /// Only the last value associated with an id is kept.
fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
obkvs.last().context("no last value").map(|last| last.clone().into_owned()) obkvs.last().context("no last value").map(|last| last.clone().into_owned())
@ -552,3 +569,73 @@ fn validate_document_id(document_id: &str) -> Option<&str> {
}) })
}) })
} }
#[cfg(test)]
mod test {
    use super::*;

    /// Unit tests for `compute_primary_key_pair`, one per resolution rule.
    mod compute_primary_key {
        use super::{compute_primary_key_pair, FieldsIdsMap};

        #[test]
        #[should_panic]
        fn should_panic_primary_key_not_in_map() {
            // "toto" was never inserted in the map, so the lookup must panic.
            let mut map = FieldsIdsMap::new();
            let _ = compute_primary_key_pair(Some("toto"), &mut map, None, false);
        }

        #[test]
        fn should_return_primary_key_if_is_some() {
            let mut map = FieldsIdsMap::new();
            map.insert("toto").unwrap();

            // The alternative name must be ignored when a primary key is provided.
            let pair = compute_primary_key_pair(
                Some("toto"),
                &mut map,
                Some("tata".to_string()),
                false,
            );

            assert_eq!(pair.unwrap(), (0u8, "toto".to_string()));
            assert_eq!(map.len(), 1);
        }

        #[test]
        fn should_return_alternative_if_primary_is_none() {
            let mut map = FieldsIdsMap::new();

            // Without a primary key the alternative name is registered and returned.
            let pair = compute_primary_key_pair(None, &mut map, Some("tata".to_string()), false);

            assert_eq!(pair.unwrap(), (0u8, "tata".to_string()));
            assert_eq!(map.len(), 1);
        }

        #[test]
        fn should_return_default_if_both_are_none() {
            let mut map = FieldsIdsMap::new();

            // With autogeneration enabled the default "id" attribute is created.
            let pair = compute_primary_key_pair(None, &mut map, None, true);

            assert_eq!(pair.unwrap(), (0u8, "id".to_string()));
            assert_eq!(map.len(), 1);
        }

        #[test]
        fn should_return_err_if_both_are_none_and_recompute_is_false() {
            let mut map = FieldsIdsMap::new();

            // No candidate and no autogeneration: an error, and the map stays empty.
            let pair = compute_primary_key_pair(None, &mut map, None, false);

            assert!(pair.is_err());
            assert_eq!(map.len(), 0);
        }
    }
}

View File

@ -1,14 +1,16 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::str::FromStr; use std::str::FromStr;
use anyhow::{ensure, Context}; use anyhow::Context;
use grenad::CompressionType; use grenad::CompressionType;
use itertools::Itertools;
use rayon::ThreadPool; use rayon::ThreadPool;
use crate::criterion::Criterion;
use crate::facet::FacetType;
use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::index_documents::{Transform, IndexDocumentsMethod};
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::facet::FacetType; use crate::{Index, FieldsIdsMap};
use crate::{Index, FieldsIdsMap, Criterion};
pub struct Settings<'a, 't, 'u, 'i> { pub struct Settings<'a, 't, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -26,7 +28,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
// however if it is `Some(None)` it means that the user forced a reset of the setting. // however if it is `Some(None)` it means that the user forced a reset of the setting.
searchable_fields: Option<Option<Vec<String>>>, searchable_fields: Option<Option<Vec<String>>>,
displayed_fields: Option<Option<Vec<String>>>, displayed_fields: Option<Option<Vec<String>>>,
faceted_fields: Option<HashMap<String, String>>, faceted_fields: Option<Option<HashMap<String, String>>>,
criteria: Option<Option<Vec<String>>>, criteria: Option<Option<Vec<String>>>,
} }
@ -67,7 +69,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} }
pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) { pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) {
self.faceted_fields = Some(names_facet_types); self.faceted_fields = Some(Some(names_facet_types));
}
pub fn reset_faceted_fields(&mut self) {
self.faceted_fields = Some(None);
} }
pub fn reset_criteria(&mut self) { pub fn reset_criteria(&mut self) {
@ -78,107 +84,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.criteria = Some(Some(criteria)); self.criteria = Some(Some(criteria));
} }
pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()> fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
where where
F: Fn(UpdateIndexingStep) + Sync F: Fn(UpdateIndexingStep) + Sync,
{ {
let mut updated_searchable_fields = None;
let mut updated_faceted_fields = None;
let mut updated_displayed_fields = None;
let mut updated_criteria = None;
// Construct the new FieldsIdsMap based on the searchable fields order.
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut fields_ids_map = match self.searchable_fields { // if the settings are set before any document update, we don't need to do anything, and
Some(Some(searchable_fields)) => { // will set the primary key during the first document addition.
let mut new_fields_ids_map = FieldsIdsMap::new(); if self.index.number_of_documents(&self.wtxn)? == 0 {
let mut new_searchable_fields = Vec::new(); return Ok(())
for name in searchable_fields {
let id = new_fields_ids_map.insert(&name).context("field id limit reached")?;
new_searchable_fields.push(id);
} }
for (_, name) in fields_ids_map.iter() {
new_fields_ids_map.insert(name).context("field id limit reached")?;
}
updated_searchable_fields = Some(Some(new_searchable_fields));
new_fields_ids_map
},
Some(None) => {
updated_searchable_fields = Some(None);
fields_ids_map
},
None => fields_ids_map,
};
// We compute or generate the new primary key field id.
// TODO make the primary key settable.
let primary_key = match self.index.primary_key(&self.wtxn)? {
Some(id) => {
let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let name = current_fields_ids_map.name(id).unwrap();
fields_ids_map.insert(name).context("field id limit reached")?
},
None => fields_ids_map.insert("id").context("field id limit reached")?,
};
let current_faceted_fields = self.index.faceted_fields(self.wtxn)?;
if let Some(fields_names_facet_types) = self.faceted_fields {
let mut faceted_fields = HashMap::new();
for (name, sftype) in fields_names_facet_types {
let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?;
let id = fields_ids_map.insert(&name).context("field id limit reached")?;
match current_faceted_fields.get(&id) {
Some(pftype) => {
ensure!(ftype == *pftype, "{} facet type changed from {} to {}", name, ftype, pftype);
faceted_fields.insert(id, ftype)
},
None => faceted_fields.insert(id, ftype),
};
}
updated_faceted_fields = Some(faceted_fields);
}
// Check that the displayed attributes have been specified.
if let Some(value) = self.displayed_fields {
match value {
Some(names) => {
let mut new_displayed_fields = Vec::new();
for name in names {
let id = fields_ids_map.insert(&name).context("field id limit reached")?;
new_displayed_fields.push(id);
}
updated_displayed_fields = Some(Some(new_displayed_fields));
}
None => updated_displayed_fields = Some(None),
}
}
if let Some(criteria) = self.criteria {
match criteria {
Some(criteria_names) => {
let mut new_criteria = Vec::new();
for name in criteria_names {
let criterion = Criterion::from_str(&mut fields_ids_map, &name)?;
if let Some(fid) = criterion.field_id() {
let name = fields_ids_map.name(fid).unwrap();
let faceted_fields = updated_faceted_fields.as_ref().unwrap_or(&current_faceted_fields);
ensure!(faceted_fields.contains_key(&fid), "criterion field {} must be faceted", name);
}
new_criteria.push(criterion);
}
updated_criteria = Some(Some(new_criteria));
},
None => updated_criteria = Some(None),
}
}
// If any setting have modified any of the datastructures it means that we need
// to retrieve the documents and then reindex then with the new settings.
if updated_searchable_fields.is_some() || updated_faceted_fields.is_some() {
let transform = Transform { let transform = Transform {
rtxn: &self.wtxn, rtxn: &self.wtxn,
index: self.index, index: self.index,
@ -192,26 +108,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
autogenerate_docids: false, autogenerate_docids: false,
}; };
// There already has been a document addition, the primary key should be set by now.
let primary_key = self.index.primary_key(&self.wtxn)?.context("Index must have a primary key")?;
// We remap the documents fields based on the new `FieldsIdsMap`. // We remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.remap_index_documents(primary_key, fields_ids_map.clone())?; let output = transform.remap_index_documents(
primary_key.to_string(),
// We write the new FieldsIdsMap to the database old_fields_ids_map,
// this way next indexing methods will be based on that. fields_ids_map.clone())?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
if let Some(faceted_fields) = updated_faceted_fields {
// We write the faceted_fields fields into the database here.
self.index.put_faceted_fields(self.wtxn, &faceted_fields)?;
}
if let Some(searchable_fields) = updated_searchable_fields {
// The new searchable fields are also written down to make sure
// that the IndexDocuments system takes only these ones into account.
match searchable_fields {
Some(fields) => self.index.put_searchable_fields(self.wtxn, &fields)?,
None => self.index.delete_searchable_fields(self.wtxn).map(drop)?,
}
}
// We clear the full database (words-fst, documents ids and documents content). // We clear the full database (words-fst, documents ids and documents content).
ClearDocuments::new(self.wtxn, self.index).execute()?; ClearDocuments::new(self.wtxn, self.index).execute()?;
@ -227,23 +131,127 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.chunk_compression_level = self.chunk_compression_level;
indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
indexing_builder.thread_pool = self.thread_pool; indexing_builder.thread_pool = self.thread_pool;
indexing_builder.execute_raw(output, &progress_callback)?; indexing_builder.execute_raw(output, &cb)?;
Ok(())
} }
if let Some(displayed_fields) = updated_displayed_fields { fn update_displayed(&mut self) -> anyhow::Result<bool> {
match displayed_fields { match self.displayed_fields {
Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, Some(Some(ref fields)) => {
None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// fields are deduplicated, only the first occurrence is taken into account
let names: Vec<_> = fields
.iter()
.unique()
.map(String::as_str)
.collect();
for name in names.iter() {
fields_ids_map
.insert(name)
.context("field id limit exceeded")?;
} }
self.index.put_displayed_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Some(None) => { self.index.delete_displayed_fields(self.wtxn)?; },
None => return Ok(false),
}
Ok(true)
} }
if let Some(criteria) = updated_criteria { /// Updates the index's searchable attributes. This causes the field map to be recomputed to
match criteria { /// reflect the order of the searchable attributes.
Some(criteria) => self.index.put_criteria(self.wtxn, &criteria)?, fn update_searchable(&mut self) -> anyhow::Result<bool> {
None => self.index.delete_criteria(self.wtxn).map(drop)?, match self.searchable_fields {
} Some(Some(ref fields)) => {
// every time the searchable attributes are updated, we need to update the
// ids for any settings that use the facets (displayed_fields,
// faceted_fields).
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut new_fields_ids_map = FieldsIdsMap::new();
// fields are deduplicated, only the first occurrence is taken into account
let names = fields
.iter()
.unique()
.map(String::as_str)
.collect::<Vec<_>>();
// Add all the searchable attributes to the field map, and then add the
// remaining fields from the old field map to the new one
for name in names.iter() {
new_fields_ids_map
.insert(&name)
.context("field id limit exceeded")?;
} }
for (_, name) in old_fields_ids_map.iter() {
new_fields_ids_map
.insert(&name)
.context("field id limit exceeded")?;
}
self.index.put_searchable_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
}
Some(None) => { self.index.delete_searchable_fields(self.wtxn)?; },
None => return Ok(false),
}
Ok(true)
}
/// Applies the pending faceted-fields setting to the index.
///
/// Returns `Ok(false)` when no change was requested, and `Ok(true)` when the
/// faceted fields were either replaced by a new set or deleted (reset).
fn update_facets(&mut self) -> anyhow::Result<bool> {
    let requested = match self.faceted_fields {
        Some(ref requested) => requested,
        // Nothing was asked for: the stored faceted fields stay as they are.
        None => return Ok(false),
    };

    if let Some(names_facet_types) = requested {
        let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
        let mut facet_types = HashMap::new();
        for (field_name, raw_type) in names_facet_types {
            // Every faceted field is registered in the fields ids map before its
            // facet type is parsed.
            fields_ids_map.insert(field_name).context("field id limit exceeded")?;
            let facet_type = FacetType::from_str(raw_type)?;
            facet_types.insert(field_name.clone(), facet_type);
        }
        self.index.put_faceted_fields(self.wtxn, &facet_types)?;
        self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
    } else {
        // The user forced a reset of the setting.
        self.index.delete_faceted_fields(self.wtxn)?;
    }

    Ok(true)
}
fn update_criteria(&mut self) -> anyhow::Result<()> {
match self.criteria {
Some(Some(ref fields)) => {
let faceted_fields = self.index.faceted_fields(&self.wtxn)?;
let mut new_criteria = Vec::new();
for name in fields {
let criterion = Criterion::from_str(&faceted_fields, &name)?;
new_criteria.push(criterion);
}
self.index.put_criteria(self.wtxn, &new_criteria)?;
}
Some(None) => { self.index.delete_criteria(self.wtxn)?; }
None => (),
}
Ok(())
}
pub fn execute<F>(mut self, progress_callback: F) -> anyhow::Result<()>
where
F: Fn(UpdateIndexingStep) + Sync
{
let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
self.update_displayed()?;
let facets_updated = self.update_facets()?;
// update_criteria MUST be called after update_facets, since criterion fields must be set
// as facets.
self.update_criteria()?;
let searchable_updated = self.update_searchable()?;
if facets_updated || searchable_updated {
self.reindex(&progress_callback, old_fields_ids_map)?;
}
Ok(()) Ok(())
} }
} }
@ -251,10 +259,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::hashmap; use maplit::hashmap;
use crate::facet::FacetType;
use crate::update::{IndexDocuments, UpdateFormat};
#[test] #[test]
fn set_and_reset_searchable_fields() { fn set_and_reset_searchable_fields() {
let path = tempfile::tempdir().unwrap(); let path = tempfile::tempdir().unwrap();
@ -336,10 +347,8 @@ mod tests {
// Check that the displayed fields are correctly set to `None` (default value). // Check that the displayed fields are correctly set to `None` (default value).
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let fields_ids = index.displayed_fields(&rtxn).unwrap(); let fields_ids = index.displayed_fields(&rtxn).unwrap();
let age_id = fields_ids_map.id("age").unwrap(); assert_eq!(fields_ids.unwrap(), (&["age"][..]));
assert_eq!(fields_ids, Some(&[age_id][..]));
drop(rtxn); drop(rtxn);
// We change the searchable fields to be the "name" field only. // We change the searchable fields to be the "name" field only.
@ -351,10 +360,8 @@ mod tests {
// Check that the displayed fields always contains only the "age" field. // Check that the displayed fields always contains only the "age" field.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let fields_ids = index.displayed_fields(&rtxn).unwrap(); let fields_ids = index.displayed_fields(&rtxn).unwrap();
let age_id = fields_ids_map.id("age").unwrap(); assert_eq!(fields_ids.unwrap(), &["age"][..]);
assert_eq!(fields_ids, Some(&[age_id][..]));
drop(rtxn); drop(rtxn);
} }
@ -402,10 +409,8 @@ mod tests {
// Check that the displayed fields are correctly set to only the "age" field. // Check that the displayed fields are correctly set to only the "age" field.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let age_field_id = fields_ids_map.id("age").unwrap();
let fields_ids = index.displayed_fields(&rtxn).unwrap(); let fields_ids = index.displayed_fields(&rtxn).unwrap();
assert_eq!(fields_ids.unwrap(), &[age_field_id][..]); assert_eq!(fields_ids.unwrap(), &["age"][..]);
drop(rtxn); drop(rtxn);
// We reset the fields ids to become `None`, the default value. // We reset the fields ids to become `None`, the default value.
@ -445,9 +450,9 @@ mod tests {
// Check that the displayed fields are correctly set. // Check that the displayed fields are correctly set.
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let fields_ids = index.faceted_fields(&rtxn).unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Integer });
// Only count the field_id 0 and level 0 facet values. // Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
assert_eq!(count, 3); assert_eq!(count, 3);
drop(rtxn); drop(rtxn);
@ -461,8 +466,49 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values. // Only count the field_id 0 and level 0 facet values.
let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
assert_eq!(count, 4); assert_eq!(count, 4);
drop(rtxn); drop(rtxn);
} }
/// Changing the searchable fields reorders the fields ids map; the other settings
/// must read back unchanged afterwards.
#[test]
fn setting_searchable_recomputes_other_settings() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, &path).unwrap();

    // The same expectations hold before and after the searchable fields are set.
    let check_settings = || {
        let rtxn = index.read_txn().unwrap();
        assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
        // since no documents have been pushed the primary key is still unset
        assert!(index.primary_key(&rtxn).unwrap().is_none());
        assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
    };

    // Set all the settings except searchable
    {
        let mut wtxn = index.write_txn().unwrap();
        let mut settings = Settings::new(&mut wtxn, &index);
        settings.set_displayed_fields(vec!["hello".to_string()]);
        settings.set_faceted_fields(hashmap!{
            "age".into() => "integer".into(),
            "toto".into() => "integer".into(),
        });
        settings.set_criteria(vec!["asc(toto)".to_string()]);
        settings.execute(|_| ()).unwrap();
        wtxn.commit().unwrap();
    }

    // check the output
    check_settings();

    // We set toto and age as searchable to force reordering of the fields
    {
        let mut wtxn = index.write_txn().unwrap();
        let mut settings = Settings::new(&mut wtxn, &index);
        settings.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]);
        settings.execute(|_| ()).unwrap();
        wtxn.commit().unwrap();
    }

    check_settings();
}
} }