diff --git a/Cargo.lock b/Cargo.lock index c719f6f03..8e6794fb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1377,7 +1377,6 @@ dependencies = [ name = "milli" version = "0.3.1" dependencies = [ - "anyhow", "big_s", "bstr", "byteorder", diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 5138de4d2..d5181849f 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -55,15 +55,15 @@ pub fn base_setup(conf: &Conf) -> Index { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); - if let Some(primary_key) = conf.primary_key { - let mut wtxn = index.write_txn().unwrap(); - index.put_primary_key(&mut wtxn, primary_key).unwrap(); - } let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); + if let Some(primary_key) = conf.primary_key { + builder.set_primary_key(primary_key.to_string()); + } + if let Some(criterion) = conf.criterion { builder.reset_filterable_fields(); builder.reset_criteria(); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 1f91e6370..e23dddd4c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -391,7 +391,7 @@ async fn main() -> anyhow::Result<()> { match result { Ok(_) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } UpdateMeta::ClearDocuments => { @@ -401,7 +401,7 @@ async fn main() -> anyhow::Result<()> { match builder.execute() { Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } UpdateMeta::Settings(settings) => { @@ -471,7 +471,7 @@ async fn main() -> anyhow::Result<()> { match result { Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } UpdateMeta::Facets(levels) => { @@ -486,7 +486,7 @@ async fn main() -> anyhow::Result<()> { } match builder.execute() { Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } }; diff --git a/infos/src/main.rs b/infos/src/main.rs index d6aa1f854..b0c304de0 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,55 +5,41 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use milli::facet::FacetType; -use milli::{Index, TreeLevel}; use structopt::StructOpt; +use milli::facet::FacetType; +use milli::index::db_name::*; +use milli::{Index, TreeLevel}; + use Command::*; #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; -const MAIN_DB_NAME: &str = "main"; -const WORD_DOCIDS_DB_NAME: &str = "word-docids"; -const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; -const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; -const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; -const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; -const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; -const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; -const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids"; -const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; -const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; -const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; -const FIELD_ID_DOCID_FACET_STRINGS_DB_NAME: &str = "field-id-docid-facet-strings"; - -const DOCUMENTS_DB_NAME: &str = "documents"; - const ALL_DATABASE_NAMES: &[&str] = &[ - MAIN_DB_NAME, - WORD_DOCIDS_DB_NAME, - WORD_PREFIX_DOCIDS_DB_NAME, - DOCID_WORD_POSITIONS_DB_NAME, - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, - WORD_LEVEL_POSITION_DOCIDS_DB_NAME, - WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, - FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME, - FACET_ID_F64_DOCIDS_DB_NAME, - FACET_ID_STRING_DOCIDS_DB_NAME, - FIELD_ID_DOCID_FACET_F64S_DB_NAME, - FIELD_ID_DOCID_FACET_STRINGS_DB_NAME, - DOCUMENTS_DB_NAME, + MAIN, + WORD_DOCIDS, + WORD_PREFIX_DOCIDS, + DOCID_WORD_POSITIONS, + WORD_PAIR_PROXIMITY_DOCIDS, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, + WORD_LEVEL_POSITION_DOCIDS, + WORD_PREFIX_LEVEL_POSITION_DOCIDS, + FIELD_ID_WORD_COUNT_DOCIDS, + FACET_ID_F64_DOCIDS, + FACET_ID_STRING_DOCIDS, + FIELD_ID_DOCID_FACET_F64S, + FIELD_ID_DOCID_FACET_STRINGS, + DOCUMENTS, ]; const POSTINGS_DATABASE_NAMES: &[&str] = &[ - WORD_DOCIDS_DB_NAME, - WORD_PREFIX_DOCIDS_DB_NAME, - DOCID_WORD_POSITIONS_DB_NAME, - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, + WORD_DOCIDS, + WORD_PREFIX_DOCIDS, + DOCID_WORD_POSITIONS, + WORD_PAIR_PROXIMITY_DOCIDS, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, ]; #[derive(Debug, StructOpt)] @@ -944,21 +930,21 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a for name in names { let database = match name.as_str() { - MAIN_DB_NAME => &main, - WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), - WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), - WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), - FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(), - FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), - FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), - FIELD_ID_DOCID_FACET_STRINGS_DB_NAME => field_id_docid_facet_strings.as_polymorph(), + MAIN => &main, + WORD_PREFIX_DOCIDS => word_prefix_docids.as_polymorph(), + WORD_DOCIDS => word_docids.as_polymorph(), + DOCID_WORD_POSITIONS => docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS => word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => word_prefix_pair_proximity_docids.as_polymorph(), + WORD_LEVEL_POSITION_DOCIDS => word_level_position_docids.as_polymorph(), + WORD_PREFIX_LEVEL_POSITION_DOCIDS => word_prefix_level_position_docids.as_polymorph(), + FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), + FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), + FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), + FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), - DOCUMENTS_DB_NAME => documents.as_polymorph(), + DOCUMENTS => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; @@ -1039,27 +1025,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu } match name { - WORD_DOCIDS_DB_NAME => { + WORD_DOCIDS => { let db = index.word_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - WORD_PREFIX_DOCIDS_DB_NAME => { + WORD_PREFIX_DOCIDS => { let db = index.word_prefix_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - DOCID_WORD_POSITIONS_DB_NAME => { + DOCID_WORD_POSITIONS => { let db = index.docid_word_positions.as_polymorph(); compute_stats::(*db, rtxn, name) }, - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => { + WORD_PAIR_PROXIMITY_DOCIDS => { let db = index.word_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => { + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { let db = index.word_prefix_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => { + FIELD_ID_WORD_COUNT_DOCIDS => { let db = index.field_id_word_count_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 9fe1ce3d3..ac7a977a2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -5,7 +5,6 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] -anyhow = "1.0.38" bstr = "0.2.15" byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index c2205613d..931cf8588 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,11 +1,12 @@ use std::fmt; use std::str::FromStr; -use anyhow::{Context, bail}; use regex::Regex; use serde::{Serialize, Deserialize}; use once_cell::sync::Lazy; +use crate::error::{Error, UserError}; + static ASC_DESC_REGEX: Lazy = Lazy::new(|| { Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() }); @@ -41,7 +42,7 @@ impl Criterion { } impl FromStr for Criterion { - type Err = anyhow::Error; + type Err = Error; fn from_str(txt: &str) -> Result { match txt { @@ -51,13 +52,15 @@ impl FromStr for Criterion { "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), text => { - let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; + let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| { + UserError::InvalidCriterionName { name: text.to_string() } + })?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), - otherwise => bail!("unknown criterion name: {}", otherwise), + text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), } }, } diff --git a/milli/src/error.rs b/milli/src/error.rs new file mode 100644 index 000000000..78a1b1c59 --- /dev/null +++ b/milli/src/error.rs @@ -0,0 +1,265 @@ +use std::convert::Infallible; +use std::error::Error as StdError; +use std::{fmt, io, str}; + +use heed::{MdbError, Error as HeedError}; +use rayon::ThreadPoolBuildError; +use serde_json::{Map, Value}; + +use crate::search::ParserRule; +use crate::{DocumentId, FieldId}; + +pub type Object = Map; + +#[derive(Debug)] +pub enum Error { + InternalError(InternalError), + IoError(io::Error), + UserError(UserError), +} + +#[derive(Debug)] +pub enum InternalError { + DatabaseClosing, + DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, + FieldIdMapMissingEntry(FieldIdMapMissingEntry), + Fst(fst::Error), + GrenadInvalidCompressionType, + IndexingMergingKeys { process: &'static str }, + InvalidDatabaseTyping, + RayonThreadPool(ThreadPoolBuildError), + SerdeJson(serde_json::Error), + Serialization(SerializationError), + Store(MdbError), + Utf8(str::Utf8Error), +} + +#[derive(Debug)] +pub enum SerializationError { + Decoding { db_name: Option<&'static str> }, + Encoding { db_name: Option<&'static str> }, + InvalidNumberSerialization, +} + +#[derive(Debug)] +pub enum FieldIdMapMissingEntry { + FieldId { field_id: FieldId, process: &'static str }, + FieldName { field_name: String, process: &'static str }, +} + +#[derive(Debug)] +pub enum UserError { + AttributeLimitReached, + Csv(csv::Error), + MaxDatabaseSizeReached, + DocumentLimitReached, + InvalidFilter(pest::error::Error), + InvalidCriterionName { name: String }, + InvalidDocumentId { document_id: Value }, + InvalidFilterAttribute(pest::error::Error), + InvalidStoreFile, + MissingDocumentId { document: Object }, + MissingPrimaryKey, + NoSpaceLeftOnDevice, + PrimaryKeyCannotBeChanged, + PrimaryKeyCannotBeReset, + SerdeJson(serde_json::Error), + UnknownInternalDocumentId { document_id: DocumentId }, +} + +impl From for Error { + fn from(error: io::Error) -> Error { + // TODO must be improved and more precise + Error::IoError(error) + } +} + +impl From for Error { + fn from(error: fst::Error) -> Error { + Error::InternalError(InternalError::Fst(error)) + } +} + +impl From> for Error where Error: From { + fn from(error: grenad::Error) -> Error { + match error { + grenad::Error::Io(error) => Error::IoError(error), + grenad::Error::Merge(error) => Error::from(error), + grenad::Error::InvalidCompressionType => { + Error::InternalError(InternalError::GrenadInvalidCompressionType) + }, + } + } +} + +impl From for Error { + fn from(error: str::Utf8Error) -> Error { + Error::InternalError(InternalError::Utf8(error)) + } +} + +impl From for Error { + fn from(_error: Infallible) -> Error { + unreachable!() + } +} + +impl From for Error { + fn from(error: HeedError) -> Error { + use self::Error::*; + use self::InternalError::*; + use self::SerializationError::*; + use self::UserError::*; + + match error { + HeedError::Io(error) => Error::from(error), + HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached), + HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile), + HeedError::Mdb(error) => InternalError(Store(error)), + HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })), + HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), + HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), + HeedError::DatabaseClosing => InternalError(DatabaseClosing), + } + } +} + +impl From for Error { + fn from(error: ThreadPoolBuildError) -> Error { + Error::InternalError(InternalError::RayonThreadPool(error)) + } +} + +impl From for Error { + fn from(error: FieldIdMapMissingEntry) -> Error { + Error::InternalError(InternalError::FieldIdMapMissingEntry(error)) + } +} + +impl From for Error { + fn from(error: InternalError) -> Error { + Error::InternalError(error) + } +} + +impl From for Error { + fn from(error: UserError) -> Error { + Error::UserError(error) + } +} + +impl From for Error { + fn from(error: SerializationError) -> Error { + Error::InternalError(InternalError::Serialization(error)) + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InternalError(error) => write!(f, "internal: {}", error), + Self::IoError(error) => error.fmt(f), + Self::UserError(error) => error.fmt(f), + } + } +} + +impl StdError for Error {} + +impl fmt::Display for InternalError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::DatabaseMissingEntry { db_name, key } => { + write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) + }, + Self::FieldIdMapMissingEntry(error) => error.fmt(f), + Self::Fst(error) => error.fmt(f), + Self::GrenadInvalidCompressionType => { + f.write_str("invalid compression type have been specified to grenad") + }, + Self::IndexingMergingKeys { process } => { + write!(f, "invalid merge while processing {}", process) + }, + Self::Serialization(error) => error.fmt(f), + Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), + Self::RayonThreadPool(error) => error.fmt(f), + Self::SerdeJson(error) => error.fmt(f), + Self::DatabaseClosing => HeedError::DatabaseClosing.fmt(f), + Self::Store(error) => error.fmt(f), + Self::Utf8(error) => error.fmt(f), + } + } +} + +impl StdError for InternalError {} + +impl fmt::Display for UserError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::Csv(error) => error.fmt(f), + Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::InvalidFilter(error) => error.fmt(f), + Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), + Self::InvalidDocumentId { document_id } => { + let json = serde_json::to_string(document_id).unwrap(); + write!(f, "document identifier is invalid {}", json) + }, + Self::InvalidFilterAttribute(error) => error.fmt(f), + Self::MissingDocumentId { document } => { + let json = serde_json::to_string(document).unwrap(); + write!(f, "document doesn't have an identifier {}", json) + }, + Self::MissingPrimaryKey => f.write_str("missing primary key"), + Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), + // TODO where can we find it instead of writing the text ourselves? + Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), + Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), + Self::PrimaryKeyCannotBeChanged => { + f.write_str("primary key cannot be changed if the database contains documents") + }, + Self::PrimaryKeyCannotBeReset => { + f.write_str("primary key cannot be reset if the database contains documents") + }, + Self::SerdeJson(error) => error.fmt(f), + Self::UnknownInternalDocumentId { document_id } => { + write!(f, "an unknown internal document id have been used ({})", document_id) + }, + } + } +} + +impl StdError for UserError {} + +impl fmt::Display for FieldIdMapMissingEntry { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::FieldId { field_id, process } => { + write!(f, "unknown field id {} coming from the {} process", field_id, process) + }, + Self::FieldName { field_name, process } => { + write!(f, "unknown field name {} coming from the {} process", field_name, process) + }, + } + } +} + +impl StdError for FieldIdMapMissingEntry {} + +impl fmt::Display for SerializationError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Decoding { db_name: Some(name) } => { + write!(f, "decoding from the {} database failed", name) + }, + Self::Decoding { db_name: None } => f.write_str("decoding failed"), + Self::Encoding { db_name: Some(name) } => { + write!(f, "encoding into the {} database failed", name) + }, + Self::Encoding { db_name: None } => f.write_str("encoding failed"), + Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), + } + } +} + +impl StdError for SerializationError {} diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs index 350efc450..259dab972 100644 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ b/milli/src/heed_codec/facet/facet_value_string_codec.rs @@ -5,6 +5,14 @@ use crate::FieldId; pub struct FacetValueStringCodec; +impl FacetValueStringCodec { + pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { + out.reserve(value.len() + 1); + out.push(field_id); + out.extend_from_slice(value.as_bytes()); + } +} + impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { type DItem = (FieldId, &'a str); @@ -19,9 +27,8 @@ impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { type EItem = (FieldId, &'a str); fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(value.len() + 1); - bytes.push(*field_id); - bytes.extend_from_slice(value.as_bytes()); + let mut bytes = Vec::new(); + FacetValueStringCodec::serialize_into(*field_id, value, &mut bytes); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index 2e282b2a0..b002346e9 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -6,6 +6,15 @@ use crate::{FieldId, DocumentId}; pub struct FieldDocIdFacetStringCodec; +impl FieldDocIdFacetStringCodec { + pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec) { + out.reserve(1 + 4 + value.len()); + out.push(field_id); + out.extend_from_slice(&document_id.to_be_bytes()); + out.extend_from_slice(value.as_bytes()); + } +} + impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { type DItem = (FieldId, DocumentId, &'a str); @@ -22,10 +31,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { type EItem = (FieldId, DocumentId, &'a str); fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(1 + 4 + value.len()); - bytes.push(*field_id); - bytes.extend_from_slice(&document_id.to_be_bytes()); - bytes.extend_from_slice(value.as_bytes()); + let mut bytes = Vec::new(); + FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs index 8d1eb79dd..994e23b39 100644 --- a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -6,6 +6,13 @@ use roaring::RoaringBitmap; pub struct BoRoaringBitmapCodec; +impl BoRoaringBitmapCodec { + pub fn serialize_into(bitmap: &RoaringBitmap, out: &mut Vec) { + out.reserve(bitmap.len() as usize * size_of::()); + bitmap.iter().map(u32::to_ne_bytes).for_each(|bytes| out.extend_from_slice(&bytes)); + } +} + impl heed::BytesDecode<'_> for BoRoaringBitmapCodec { type DItem = RoaringBitmap; @@ -25,12 +32,8 @@ impl heed::BytesEncode<'_> for BoRoaringBitmapCodec { type EItem = RoaringBitmap; fn bytes_encode(item: &Self::EItem) -> Option> { - let mut out = Vec::with_capacity(item.len() as usize * size_of::()); - - item.iter() - .map(|i| i.to_ne_bytes()) - .for_each(|bytes| out.extend_from_slice(&bytes)); - + let mut out = Vec::new(); + BoRoaringBitmapCodec::serialize_into(item, &mut out); Some(Cow::Owned(out)) } } diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 8ccf831e3..325effa73 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -23,18 +23,17 @@ impl CboRoaringBitmapCodec { } } - pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) -> io::Result<()> { + pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) { if roaring.len() <= THRESHOLD as u64 { // If the number of items (u32s) to encode is less than or equal to the threshold // it means that it would weigh the same or less than the RoaringBitmap // header, so we directly encode them using ByteOrder instead. for integer in roaring { - vec.write_u32::(integer)?; + vec.write_u32::(integer).unwrap(); } - Ok(()) } else { // Otherwise, we use the classic RoaringBitmapCodec that writes a header. - roaring.serialize_into(vec) + roaring.serialize_into(vec).unwrap(); } } @@ -68,7 +67,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { fn bytes_encode(item: &Self::EItem) -> Option> { let mut vec = Vec::with_capacity(Self::serialized_size(item)); - Self::serialize_into(item, &mut vec).ok()?; + Self::serialize_into(item, &mut vec); Some(Cow::Owned(vec)) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index 4e32f673a..bf4b3e023 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2,14 +2,14 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::path::Path; -use anyhow::Context; use chrono::{DateTime, Utc}; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; use roaring::RoaringBitmap; +use crate::error::{UserError, FieldIdMapMissingEntry, InternalError}; use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; -use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; +use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, @@ -21,25 +21,44 @@ use crate::heed_codec::facet::{ }; use crate::fields_ids_map::FieldsIdsMap; -pub const CRITERIA_KEY: &str = "criteria"; -pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; -pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; -pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; -pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; -pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; -pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; -pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; -pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; -pub const PRIMARY_KEY_KEY: &str = "primary-key"; -pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; -pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; -pub const STOP_WORDS_KEY: &str = "stop-words"; -pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; -pub const SYNONYMS_KEY: &str = "synonyms"; -pub const WORDS_FST_KEY: &str = "words-fst"; -pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; -const CREATED_AT_KEY: &str = "created-at"; -const UPDATED_AT_KEY: &str = "updated-at"; +pub mod main_key { + pub const CRITERIA_KEY: &str = "criteria"; + pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; + pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; + pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; + pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; + pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; + pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; + pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; + pub const PRIMARY_KEY_KEY: &str = "primary-key"; + pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; + pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; + pub const STOP_WORDS_KEY: &str = "stop-words"; + pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; + pub const SYNONYMS_KEY: &str = "synonyms"; + pub const WORDS_FST_KEY: &str = "words-fst"; + pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; + pub const CREATED_AT_KEY: &str = "created-at"; + pub const UPDATED_AT_KEY: &str = "updated-at"; +} + +pub mod db_name { + pub const MAIN: &str = "main"; + pub const WORD_DOCIDS: &str = "word-docids"; + pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; + pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; + pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; + pub const WORD_LEVEL_POSITION_DOCIDS: &str = "word-level-position-docids"; + pub const WORD_PREFIX_LEVEL_POSITION_DOCIDS: &str = "word-prefix-level-position-docids"; + pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; + pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; + pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; + pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; + pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; + pub const DOCUMENTS: &str = "documents"; +} #[derive(Clone)] pub struct Index { @@ -84,24 +103,26 @@ pub struct Index { } impl Index { - pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { + pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { + use db_name::*; + options.max_dbs(14); let env = options.open(path)?; - let main = env.create_poly_database(Some("main"))?; - let word_docids = env.create_database(Some("word-docids"))?; - let word_prefix_docids = env.create_database(Some("word-prefix-docids"))?; - let docid_word_positions = env.create_database(Some("docid-word-positions"))?; - let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; - let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; - let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; - let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?; - let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; - let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?; - let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?; - let field_id_docid_facet_f64s = env.create_database(Some("field-id-docid-facet-f64s"))?; - let field_id_docid_facet_strings = env.create_database(Some("field-id-docid-facet-strings"))?; - let documents = env.create_database(Some("documents"))?; + let main = env.create_poly_database(Some(MAIN))?; + let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; + let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; + let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; + let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; + let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; + let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; + let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; + let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; + let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; + let documents = env.create_database(Some(DOCUMENTS))?; Index::initialize_creation_dates(&env, main)?; @@ -127,10 +148,10 @@ impl Index { fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> { let mut txn = env.write_txn()?; // The db was just created, we update its metadata with the relevant information. - if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { + if main.get::<_, Str, SerdeJson>>(&txn, main_key::CREATED_AT_KEY)?.is_none() { let now = Utc::now(); - main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; - main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, main_key::UPDATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, main_key::CREATED_AT_KEY, &now)?; txn.commit()?; } Ok(()) @@ -163,43 +184,43 @@ impl Index { /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. - pub fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, DOCUMENTS_IDS_KEY, docids) + pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) } /// Returns the internal documents ids. pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default()) } /// Returns the number of documents indexed in the database. - pub fn number_of_documents(&self, rtxn: &RoTxn) -> anyhow::Result { - let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, DOCUMENTS_IDS_KEY)?; + pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { + let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; Ok(count.unwrap_or_default()) } /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. - pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { + pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &Utc::now())?; - self.main.put::<_, Str, Str>(wtxn, PRIMARY_KEY_KEY, &primary_key) + self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) } /// Deletes the primary key of the documents, this can be done to reset indexes settings. pub fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, PRIMARY_KEY_KEY) + self.main.delete::<_, Str>(wtxn, main_key::PRIMARY_KEY_KEY) } /// Returns the documents primary key, `None` if it hasn't been defined. pub fn primary_key<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, PRIMARY_KEY_KEY) + self.main.get::<_, Str, Str>(rtxn, main_key::PRIMARY_KEY_KEY) } /* external documents ids */ /// Writes the external documents ids and internal ids (i.e. `u32`). - pub fn put_external_documents_ids<'a>( + pub(crate) fn put_external_documents_ids<'a>( &self, wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, @@ -208,16 +229,16 @@ impl Index { let ExternalDocumentsIds { hard, soft } = external_documents_ids; let hard = hard.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>(wtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; - self.main.put::<_, Str, ByteSlice>(wtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; Ok(()) } /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). - pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result> { - let hard = self.main.get::<_, Str, ByteSlice>(rtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = self.main.get::<_, Str, ByteSlice>(rtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; + pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { + let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; let hard = match hard { Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, @@ -233,93 +254,112 @@ impl Index { /// Writes the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. - pub fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_IDS_MAP_KEY, map) + pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) } /// Returns the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeJson>( + rtxn, + main_key::FIELDS_IDS_MAP_KEY, + )?.unwrap_or_default()) } /* fields distribution */ /// Writes the fields distribution which associates every field name with /// the number of times it occurs in the documents. - pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_DISTRIBUTION_KEY, distribution) + pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) } /// Returns the fields distribution which associates every field name with /// the number of times it occurs in the documents. pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeJson>( + rtxn, + main_key::FIELDS_DISTRIBUTION_KEY, + )?.unwrap_or_default()) } /* displayed fields */ /// Writes the fields that must be displayed in the defined order. /// There must be not be any duplicate field id. - pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, DISPLAYED_FIELDS_KEY, &fields) + pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) } /// Deletes the displayed fields ids, this will make the engine to display /// all the documents attributes in the order of the `FieldsIdsMap`. pub fn delete_displayed_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISPLAYED_FIELDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::DISPLAYED_FIELDS_KEY) } /// Returns the displayed fields in the order they were set by the user. If it returns /// `None` it means that all the attributes are set as displayed in the order of the `FieldsIdsMap`. pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { - self.main.get::<_, Str, SerdeBincode>>(rtxn, DISPLAYED_FIELDS_KEY) + self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::DISPLAYED_FIELDS_KEY) } - pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> heed::Result>> { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let ids = self.displayed_fields(rtxn)? - .map(|fields| fields - .into_iter() - .map(|name| fields_ids_map.id(name).expect("Field not found")) - .collect::>()); - Ok(ids) + /// Identical to `displayed_fields`, but returns the ids instead. + pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> Result>> { + match self.displayed_fields(rtxn)? { + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields.into_iter() { + match fields_ids_map.id(name) { + Some(field_id) => fields_ids.push(field_id), + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::displayed_fields_ids", + }.into()), + } + } + Ok(Some(fields_ids)) + }, + None => Ok(None), + } } /* searchable fields */ /// Writes the searchable fields, when this list is specified, only these are indexed. - pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, SEARCHABLE_FIELDS_KEY, &fields) + pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) } /// Deletes the searchable fields, when no fields are specified, all fields are indexed. pub fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, SEARCHABLE_FIELDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::SEARCHABLE_FIELDS_KEY) } /// Returns the searchable fields, those are the fields that are indexed, /// if the searchable fields aren't there it means that **all** the fields are indexed. pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { - self.main.get::<_, Str, SerdeBincode>>(rtxn, SEARCHABLE_FIELDS_KEY) + self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::SEARCHABLE_FIELDS_KEY) } /// Identical to `searchable_fields`, but returns the ids instead. - pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> heed::Result>> { + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { match self.searchable_fields(rtxn)? { - Some(names) => { - let fields_map = self.fields_ids_map(rtxn)?; - let mut ids = Vec::new(); - for name in names { - let id = fields_map - .id(name) - .ok_or_else(|| format!("field id map must contain {:?}", name)) - .expect("corrupted data: "); - ids.push(id); + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + match fields_ids_map.id(name) { + Some(field_id) => fields_ids.push(field_id), + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::searchable_fields_ids", + }.into()), + } } - Ok(Some(ids)) - } + Ok(Some(fields_ids)) + }, None => Ok(None), } } @@ -327,35 +367,42 @@ impl Index { /* filterable fields */ /// Writes the filterable fields names in the database. - pub fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, FILTERABLE_FIELDS_KEY, fields) + pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) } /// Deletes the filterable fields ids in the database. pub fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, FILTERABLE_FIELDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::FILTERABLE_FIELDS_KEY) } /// Returns the filterable fields names. pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { - Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FILTERABLE_FIELDS_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeJson<_>>( + rtxn, + main_key::FILTERABLE_FIELDS_KEY, + )?.unwrap_or_default()) } - /// Same as `filterable_fields`, but returns ids instead. - pub fn filterable_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { - let filterable_fields = self.filterable_fields(rtxn)?; + /// Identical to `filterable_fields`, but returns ids instead. + pub fn filterable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.filterable_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; - let filterable_fields = filterable_fields - .iter() - .map(|k| { - fields_ids_map - .id(k) - .ok_or_else(|| format!("{:?} should be present in the field id map", k)) - .expect("corrupted data: ") - }) - .collect(); - Ok(filterable_fields) + let mut fields_ids = HashSet::new(); + for name in fields { + match fields_ids_map.id(&name) { + Some(field_id) => { + fields_ids.insert(field_id); + }, + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::filterable_fields_ids", + }.into()), + } + } + + Ok(fields_ids) } /* faceted documents ids */ @@ -363,7 +410,7 @@ impl Index { /// Returns the faceted fields names. /// /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields. - pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { + pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = self.criteria(rtxn)? @@ -382,36 +429,40 @@ impl Index { Ok(faceted_fields) } - /// Same as `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { - let faceted_fields = self.faceted_fields(rtxn)?; + /// Identical to `faceted_fields`, but returns ids instead. + pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.faceted_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; - let faceted_fields = faceted_fields - .iter() - .map(|k| { - fields_ids_map - .id(k) - .ok_or_else(|| format!("{:?} should be present in the field id map", k)) - .expect("corrupted data: ") - }) - .collect(); - Ok(faceted_fields) + let mut fields_ids = HashSet::new(); + for name in fields.into_iter() { + match fields_ids_map.id(&name) { + Some(field_id) => { + fields_ids.insert(field_id); + }, + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::faceted_fields_ids", + }.into()), + } + } + + Ok(fields_ids) } /* faceted documents ids */ /// Writes the documents ids that are faceted with numbers under this field id. - pub fn put_number_faceted_documents_ids( + pub(crate) fn put_number_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -423,9 +474,9 @@ impl Index { field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), @@ -434,16 +485,16 @@ impl Index { } /// Writes the documents ids that are faceted with strings under this field id. - pub fn put_string_faceted_documents_ids( + pub(crate) fn put_string_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -455,9 +506,9 @@ impl Index { field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), @@ -468,29 +519,29 @@ impl Index { /* distinct field */ pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, DISTINCT_FIELD_KEY, distinct_field) + self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) } pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, DISTINCT_FIELD_KEY) + self.main.get::<_, Str, Str>(rtxn, main_key::DISTINCT_FIELD_KEY) } pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISTINCT_FIELD_KEY) + self.main.delete::<_, Str>(wtxn, main_key::DISTINCT_FIELD_KEY) } /* criteria */ - pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, CRITERIA_KEY, &criteria) + pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } pub fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, CRITERIA_KEY) + self.main.delete::<_, Str>(wtxn, main_key::CRITERIA_KEY) } pub fn criteria(&self, rtxn: &RoTxn) -> heed::Result> { - match self.main.get::<_, Str, SerdeJson>>(rtxn, CRITERIA_KEY)? { + match self.main.get::<_, Str, SerdeJson>>(rtxn, main_key::CRITERIA_KEY)? { Some(criteria) => Ok(criteria), None => Ok(default_criteria()), } @@ -499,13 +550,13 @@ impl Index { /* words fst */ /// Writes the FST which is the words dictionary of the engine. - pub fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes()) + pub(crate) fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) } /// Returns the FST which is the words dictionary of the engine. - pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? { + pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Set::default().map_data(Cow::Owned)?), } @@ -513,16 +564,16 @@ impl Index { /* stop words */ - pub fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes()) + pub(crate) fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::STOP_WORDS_KEY) } - pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? { + pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::STOP_WORDS_KEY)? { Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), None => Ok(None), } @@ -530,19 +581,29 @@ impl Index { /* synonyms */ - pub fn put_synonyms(&self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, SYNONYMS_KEY, synonyms) + pub(crate) fn put_synonyms( + &self, + wtxn: &mut RwTxn, + synonyms: &HashMap, Vec>>, + ) -> heed::Result<()> + { + self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) } pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY) } pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { - Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, SYNONYMS_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default()) } - pub fn words_synonyms>(&self, rtxn: &RoTxn, words: &[S]) -> heed::Result>>> { + pub fn words_synonyms>( + &self, + rtxn: &RoTxn, + words: &[S], + ) -> heed::Result>>> + { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms(rtxn)?.remove(&words)) } @@ -550,13 +611,13 @@ impl Index { /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. - pub fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) + pub(crate) fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) } /// Returns the FST which is the words prefixes dictionnary of the engine. - pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_PREFIXES_FST_KEY)? { + pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Set::default().map_data(Cow::Owned)?), } @@ -577,13 +638,13 @@ impl Index { &self, rtxn: &'t RoTxn, ids: impl IntoIterator, - ) -> anyhow::Result)>> + ) -> Result)>> { let mut documents = Vec::new(); for id in ids { let kv = self.documents.get(rtxn, &BEU32::new(id))? - .with_context(|| format!("Could not find document {}", id))?; + .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; documents.push((id, kv)); } @@ -594,7 +655,7 @@ impl Index { pub fn all_documents<'t>( &self, rtxn: &'t RoTxn, - ) -> anyhow::Result)>>> { + ) -> Result)>>> { Ok(self .documents .iter(rtxn)? @@ -611,23 +672,27 @@ impl Index { } /// Returns the index creation time. - pub fn created_at(&self, rtxn: &RoTxn) -> heed::Result> { - let time = self.main - .get::<_, Str, SerdeJson>>(rtxn, CREATED_AT_KEY)? - .expect("Index without creation time"); - Ok(time) + pub fn created_at(&self, rtxn: &RoTxn) -> Result> { + Ok(self.main + .get::<_, Str, SerdeJson>>(rtxn, main_key::CREATED_AT_KEY)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::CREATED_AT_KEY), + })?) } /// Returns the index last updated time. - pub fn updated_at(&self, rtxn: &RoTxn) -> heed::Result> { - let time = self.main - .get::<_, Str, SerdeJson>>(rtxn, UPDATED_AT_KEY)? - .expect("Index without update time"); - Ok(time) + pub fn updated_at(&self, rtxn: &RoTxn) -> Result> { + Ok(self.main + .get::<_, Str, SerdeJson>>(rtxn, main_key::UPDATED_AT_KEY)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::UPDATED_AT_KEY), + })?) } pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) + self.main.put::<_, Str, SerdeJson>>(wtxn, main_key::UPDATED_AT_KEY, &time) } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8c1ed514c..f37244114 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,6 +1,7 @@ #[macro_use] extern crate pest_derive; mod criterion; +mod error; mod external_documents_ids; mod fields_ids_map; mod search; @@ -14,12 +15,13 @@ pub mod update; use std::borrow::Cow; use std::collections::HashMap; use std::hash::BuildHasherDefault; +use std::result::Result as StdResult; -use anyhow::Context; use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; +pub use self::error::Error; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; @@ -29,6 +31,8 @@ pub use self::index::Index; pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords}; pub use self::tree_level::TreeLevel; +pub type Result = std::result::Result; + pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; @@ -43,21 +47,24 @@ pub type FieldId = u8; pub type Position = u32; pub type FieldsDistribution = HashMap; -type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result>; +type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReader, -) -> anyhow::Result> +) -> Result> { displayed_fields.iter() .copied() .flat_map(|id| obkv.get(id).map(|value| (id, value))) .map(|(id, value)| { - let name = fields_ids_map.name(id).context("unknown obkv field id")?; - let value = serde_json::from_slice(value)?; + let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId { + field_id: id, + process: "obkv_to_json", + })?; + let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?; Ok((name.to_owned(), value)) }) .collect() diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index f90f3e421..95f77fd78 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,15 +1,15 @@ use std::mem::take; -use anyhow::Context; use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::{FieldId, Index}; +use crate::{FieldId, Index, Result}; use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make @@ -36,7 +36,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result { + ) -> Result { Self::new(index, rtxn, parent, field_name, true) } @@ -45,7 +45,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result { + ) -> Result { Self::new(index, rtxn, parent, field_name, false) } @@ -55,11 +55,14 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, ascending: bool, - ) -> anyhow::Result { + ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map .id(&field_name) - .with_context(|| format!("field {:?} isn't registered", field_name))?; + .ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: field_name.clone(), + process: "AscDesc::new", + })?; Ok(AscDesc { index, @@ -79,7 +82,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. self.allowed_candidates -= params.excluded_candidates; @@ -162,7 +165,7 @@ fn facet_ordered<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result> + 't>> { +) -> Result> + 't>> { if candidates.len() <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) @@ -186,7 +189,7 @@ fn iterative_facet_ordered_iter<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result + 't> { +) -> Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index f825623f6..f191defe1 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -5,7 +5,7 @@ use std::mem::take; use roaring::RoaringBitmap; -use crate::{TreeLevel, search::build_dfa}; +use crate::{TreeLevel, Result, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; @@ -48,7 +48,7 @@ impl<'t> Attribute<'t> { impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some((_, _, allowed_candidates)) = self.state.as_mut() { *allowed_candidates -= params.excluded_candidates; @@ -224,7 +224,12 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn new( + ctx: &'t dyn Context<'t>, + queries: &'q [Query], + wdcache: &mut WordDerivationsCache, + ) -> Result> + { let mut inner = Vec::with_capacity(queries.len()); for query in queries { match &query.kind { @@ -471,7 +476,7 @@ fn initialize_query_level_iterators<'t, 'q>( branches: &'q FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result>> { +) -> Result>> { let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { @@ -521,7 +526,7 @@ fn set_compute_candidates<'t>( branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result> +) -> Result> { let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); @@ -573,7 +578,7 @@ fn linear_compute_candidates( ctx: &dyn Context, branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, -) -> anyhow::Result> +) -> Result> { fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 4d9e54f6e..eb44b7b8e 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -14,7 +14,7 @@ use crate::search::criteria::{ CriterionResult, resolve_query_tree, }; -use crate::TreeLevel; +use crate::{TreeLevel, Result}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -45,7 +45,7 @@ impl<'t> Exactness<'t> { impl<'t> Criterion for Exactness<'t> { #[logging_timer::time("Exactness::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some(state) = self.state.as_mut() { state.difference_with(params.excluded_candidates); @@ -158,7 +158,7 @@ fn resolve_state( ctx: &dyn Context, state: State, query: &[ExactQueryPart], -) -> anyhow::Result<(RoaringBitmap, Option)> +) -> Result<(RoaringBitmap, Option)> { use State::*; match state { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 860362f51..645a3a5d7 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,6 +1,7 @@ use log::debug; use roaring::RoaringBitmap; +use crate::Result; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; @@ -29,7 +30,7 @@ impl<'t> Final<'t> { } #[logging_timer::time("Final::{}")] - pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result> { + pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result> { debug!("Final iteration"); let excluded_candidates = &self.returned_candidates | excluded_candidates; let mut criterion_parameters = CriterionParameters { diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 5d242a0eb..e6d0a17f7 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; +use crate::Result; use crate::search::query_tree::Operation; - use super::{Criterion, CriterionResult, CriterionParameters}; pub struct Initial { @@ -22,7 +22,7 @@ impl Initial { impl Criterion for Initial { #[logging_timer::time("Initial::{}")] - fn next(&mut self, _: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, _: &mut CriterionParameters) -> Result> { Ok(self.answer.take()) } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b14d75ddb..48af0b8aa 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use roaring::RoaringBitmap; use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; -use crate::{Index, DocumentId}; +use crate::{Index, DocumentId, Result}; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; @@ -26,7 +26,7 @@ mod words; pub mod r#final; pub trait Criterion { - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result>; + fn next(&mut self, params: &mut CriterionParameters) -> Result>; } /// The result of a call to the parent criterion. @@ -76,10 +76,11 @@ pub trait Context<'c> { fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> heed::Result>; + fn searchable_fields_ids(&self) -> Result>; fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result>; - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error>; + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result>; } + pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, index: &'t Index, @@ -173,7 +174,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.words_synonyms(self.rtxn, &[word]) } - fn searchable_fields_ids(&self) -> heed::Result> { + fn searchable_fields_ids(&self) -> Result> { match self.index.searchable_fields_ids(self.rtxn)? { Some(searchable_fields_ids) => Ok(searchable_fields_ids), None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), @@ -185,14 +186,14 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.field_id_word_count_docids.get(self.rtxn, &key) } - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result> { let key = (word, level, left, right); self.index.word_level_position_docids.get(self.rtxn, &key) } } impl<'t> CriteriaBuilder<'t> { - pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { + pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> Result { let words_fst = index.words_fst(rtxn)?; let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) @@ -203,7 +204,7 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, - ) -> anyhow::Result> + ) -> Result> { use crate::criterion::Criterion as Name; @@ -230,13 +231,13 @@ pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result + ) -> Result { use Operation::{And, Phrase, Or, Query}; @@ -244,7 +245,7 @@ pub fn resolve_query_tree<'t>( And(ops) => { let mut ops = ops.iter().map(|op| { resolve_operation(ctx, op, wdcache) - }).collect::>>()?; + }).collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -302,7 +303,7 @@ fn all_word_pair_proximity_docids, U: AsRef>( left_words: &[(T, u8)], right_words: &[(U, u8)], proximity: u8 -) -> anyhow::Result +) -> Result { let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { @@ -318,7 +319,7 @@ fn query_docids( ctx: &dyn Context, query: &Query, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { match &query.kind { QueryKind::Exact { word, .. } => { @@ -354,7 +355,7 @@ fn query_pair_proximity_docids( right: &Query, proximity: u8, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { if proximity >= 8 { let mut candidates = query_docids(ctx, left, wdcache)?; @@ -477,11 +478,11 @@ pub mod test { todo!() } - fn searchable_fields_ids(&self) -> heed::Result> { + fn searchable_fields_ids(&self) -> Result> { todo!() } - fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> Result, heed::Error> { + fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result> { todo!() } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 4da6fd1eb..c3c8027cb 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -5,9 +5,10 @@ use std::mem::take; use roaring::RoaringBitmap; use log::debug; -use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; +use crate::search::{query_tree::QueryKind}; +use crate::{DocumentId, Position, Result}; use super::{ Context, Criterion, @@ -55,7 +56,7 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some((_, _, allowed_candidates)) = self.state.as_mut() { *allowed_candidates -= params.excluded_candidates; @@ -161,7 +162,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, @@ -169,7 +170,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { use Operation::{And, Phrase, Or}; @@ -227,7 +228,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { (0..=mana.min(left_max)).map(move |m| (m, mana - m)) @@ -281,7 +282,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { // Extract the first two elements but gives the tail // that is just after the first element. @@ -324,13 +325,13 @@ fn resolve_plane_sweep_candidates( query_tree: &Operation, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result> +) -> Result> { /// FIXME may be buggy with query like "new new york" fn plane_sweep( groups_positions: Vec>, consecutive: bool, - ) -> anyhow::Result> + ) -> Result> { fn compute_groups_proximity( groups: &[(usize, (Position, u8, Position))], @@ -451,7 +452,7 @@ fn resolve_plane_sweep_candidates( rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, words_positions: &HashMap, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { use Operation::{And, Phrase, Or}; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index d075b6bca..436f4affd 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,6 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; +use crate::Result; use super::{ Candidates, Context, @@ -43,7 +44,7 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { use Candidates::{Allowed, Forbidden}; // remove excluded candidates when next is called, instead of doing it in the loop. match self.state.as_mut() { @@ -163,14 +164,14 @@ fn alterate_query_tree( mut query_tree: Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn recurse( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result<()> + ) -> Result<()> { use Operation::{And, Phrase, Or}; @@ -218,7 +219,7 @@ fn resolve_candidates<'t>( number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, @@ -226,7 +227,7 @@ fn resolve_candidates<'t>( number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result + ) -> Result { use Operation::{And, Phrase, Or, Query}; @@ -277,7 +278,7 @@ fn resolve_candidates<'t>( mana: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result + ) -> Result { match branches.split_first() { Some((head, [])) => { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 8730fa331..add90d80d 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -4,6 +4,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; +use crate::Result; use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree}; pub struct Words<'t> { @@ -30,7 +31,7 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some(candidates) = self.candidates.as_mut() { *candidates -= params.excluded_candidates; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index de7b28141..b9ffd9d90 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -3,9 +3,11 @@ use std::mem::size_of; use heed::types::ByteSlice; use roaring::RoaringBitmap; -use super::{Distinct, DocIter}; +use crate::error::InternalError; use crate::heed_codec::facet::*; -use crate::{DocumentId, FieldId, Index}; +use crate::index::db_name; +use crate::{DocumentId, FieldId, Index, Result}; +use super::{Distinct, DocIter}; const FID_SIZE: usize = size_of::(); const DOCID_SIZE: usize = size_of::(); @@ -57,14 +59,17 @@ impl<'a> FacetDistinctIter<'a> { .get(self.txn, &(self.distinct, 0, key, key)) } - fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { + fn distinct_string(&mut self, id: DocumentId) -> Result<()> { let iter = facet_string_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; let facet_docids = self .facet_string_docids(value)? - .expect("Corrupted data: Facet values must exist"); + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::FACET_ID_STRING_DOCIDS, + key: None, + })?; self.excluded.union_with(&facet_docids); } @@ -73,14 +78,17 @@ impl<'a> FacetDistinctIter<'a> { Ok(()) } - fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { + fn distinct_number(&mut self, id: DocumentId) -> Result<()> { let iter = facet_number_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; let facet_docids = self .facet_number_docids(value)? - .expect("Corrupted data: Facet values must exist"); + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::FACET_ID_F64_DOCIDS, + key: None, + })?; self.excluded.union_with(&facet_docids); } @@ -92,7 +100,7 @@ impl<'a> FacetDistinctIter<'a> { /// Performs the next iteration of the facet distinct. This is a convenience method that is /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. - fn next_inner(&mut self) -> anyhow::Result> { + fn next_inner(&mut self) -> Result> { // The first step is to remove all the excluded documents from our candidates self.candidates.difference_with(&self.excluded); @@ -129,7 +137,7 @@ fn facet_number_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> anyhow::Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index @@ -146,7 +154,7 @@ fn facet_string_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> anyhow::Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index @@ -159,7 +167,7 @@ fn facet_string_values<'a>( } impl Iterator for FacetDistinctIter<'_> { - type Item = anyhow::Result; + type Item = Result; fn next(&mut self) -> Option { self.next_inner().transpose() diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 1b7c69c7a..99bc74be0 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -3,13 +3,13 @@ mod noop_distinct; use roaring::RoaringBitmap; -use crate::DocumentId; +use crate::{DocumentId, Result}; pub use facet_distinct::FacetDistinct; pub use noop_distinct::NoopDistinct; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. /// It provides a way to get back the ownership to the excluded set. -pub trait DocIter: Iterator> { +pub trait DocIter: Iterator> { /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } @@ -106,7 +106,7 @@ mod test { /// Checks that all the candidates are distinct, and returns the candidates number. pub(crate) fn validate_distinct_candidates( - candidates: impl Iterator>, + candidates: impl Iterator>, distinct: FieldId, index: &Index, ) -> usize { diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index bfaafed85..812701794 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,6 +1,6 @@ use roaring::{RoaringBitmap, bitmap::IntoIter}; -use crate::DocumentId; +use crate::{DocumentId, Result}; use super::{DocIter, Distinct}; /// A distinct implementer that does not perform any distinct, @@ -13,7 +13,7 @@ pub struct NoopDistinctIter { } impl Iterator for NoopDistinctIter { - type Item = anyhow::Result; + type Item = Result; fn next(&mut self) -> Option { self.candidates.next().map(Ok) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 565f4c6dd..265a8ffeb 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -2,15 +2,15 @@ use std::collections::{HashSet, BTreeMap}; use std::ops::Bound::Unbounded; use std::{cmp, fmt}; -use anyhow::Context; use heed::{Database, BytesDecode}; use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; +use crate::error::FieldIdMapMissingEntry; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; -use crate::{Index, FieldId, DocumentId}; +use crate::{Index, FieldId, DocumentId, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -195,14 +195,15 @@ impl<'a> FacetDistribution<'a> { } } - pub fn execute(&self) -> anyhow::Result>> { + pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let filterable_fields = self.index.filterable_fields(self.rtxn)?; let mut distribution = BTreeMap::new(); for name in filterable_fields { - let fid = fields_ids_map.id(&name).with_context(|| { - format!("missing field name {:?} from the fields id map", name) + let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.clone(), + process: "FacetDistribution::execute", })?; let values = self.facet_values(fid)?; distribution.insert(name, values); diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f58443b6f..424118f77 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; +use std::result::Result as StdResult; use std::str::FromStr; use either::Either; @@ -11,8 +12,9 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; +use crate::error::UserError; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; +use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result}; use super::FacetRange; use super::parser::Rule; @@ -60,7 +62,7 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, array: I, - ) -> anyhow::Result> + ) -> Result> where I: IntoIterator>, J: IntoIterator, A: AsRef, @@ -104,11 +106,11 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, expression: &str, - ) -> anyhow::Result + ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields_ids(rtxn)?; - let lexed = FilterParser::parse(Rule::prgm, expression)?; + let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -116,7 +118,7 @@ impl FilterCondition { fim: &FieldsIdsMap, ff: &HashSet, expression: Pairs, - ) -> anyhow::Result + ) -> Result { PREC_CLIMBER.climb( expression, @@ -133,7 +135,7 @@ impl FilterCondition { Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), _ => unreachable!(), }, - |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + |lhs: Result, op: Pair, rhs: Result| { match op.as_rule() { Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), @@ -158,16 +160,17 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)?; let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); - let lvalue = lresult?; - let rvalue = rresult?; + let lvalue = lresult.map_err(UserError::InvalidFilter)?; + let rvalue = rresult.map_err(UserError::InvalidFilter)?; Ok(Operator(fid, Between(lvalue, rvalue))) } @@ -176,10 +179,11 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); @@ -192,60 +196,68 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; - Ok(Operator(fid, GreaterThan(result?))) + Ok(Operator(fid, GreaterThan(value))) } fn greater_than_or_equal( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; - Ok(Operator(fid, GreaterThanOrEqual(result?))) + Ok(Operator(fid, GreaterThanOrEqual(value))) } fn lower_than( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; - Ok(Operator(fid, LowerThan(result?))) + Ok(Operator(fid, LowerThan(value))) } fn lower_than_or_equal( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; - Ok(Operator(fid, LowerThanOrEqual(result?))) + Ok(Operator(fid, LowerThanOrEqual(value))) } } @@ -260,7 +272,7 @@ impl FilterCondition { left: Bound, right: Bound, output: &mut RoaringBitmap, - ) -> anyhow::Result<()> + ) -> Result<()> { match (left, right) { // If the request is an exact value we must go directly to the deepest level. @@ -332,7 +344,7 @@ impl FilterCondition { strings_db: heed::Database, field_id: FieldId, operator: &Operator, - ) -> anyhow::Result + ) -> Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the @@ -390,7 +402,7 @@ impl FilterCondition { &self, rtxn: &heed::RoTxn, index: &Index, - ) -> anyhow::Result + ) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; @@ -422,7 +434,7 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> Result> +) -> StdResult> { // lexing ensures that we at least have a key let key = items.next().unwrap(); @@ -463,7 +475,7 @@ fn field_id( /// the original string that we tried to parse. /// /// Returns the parsing error associated with the span if the conversion fails. -fn pest_parse(pair: Pair) -> (Result>, String) +fn pest_parse(pair: Pair) -> (StdResult>, String) where T: FromStr, T::Err: ToString, { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index a5e02fc9f..a1a03dba3 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -9,8 +9,9 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{Index, FieldId}; -pub use self::filter_condition::{FilterCondition, Operator}; pub use self::facet_distribution::FacetDistribution; +pub use self::filter_condition::{FilterCondition, Operator}; +pub(crate) use self::parser::Rule as ParserRule; mod filter_condition; mod facet_distribution; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 872ebfca6..3c85796bc 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::hash_map::{Entry, HashMap}; use std::fmt; use std::mem::take; +use std::result::Result as StdResult; use std::str::Utf8Error; use std::time::Instant; @@ -12,14 +13,17 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; +use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::r#final::{Final, FinalResult}; -use crate::{Index, DocumentId}; +use crate::{Index, DocumentId, Result}; pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator}; pub use self::matching_words::MatchingWords; +pub(crate) use self::facet::ParserRule; use self::query_tree::QueryTreeBuilder; +use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; + // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); @@ -93,7 +97,7 @@ impl<'a> Search<'a> { self } - pub fn execute(&self) -> anyhow::Result { + pub fn execute(&self) -> Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); let (query_tree, primitive_query) = match self.query.as_ref() { @@ -140,7 +144,10 @@ impl<'a> Search<'a> { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; - let id = field_ids_map.id(name).expect("distinct not present in field map"); + let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "distinct attribute", + })?; let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } @@ -152,7 +159,7 @@ impl<'a> Search<'a> { mut distinct: D, matching_words: MatchingWords, mut criteria: Final, - ) -> anyhow::Result + ) -> Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); @@ -225,7 +232,7 @@ pub fn word_derivations<'c>( max_typo: u8, fst: &fst::Set>, cache: &'c mut WordDerivationsCache, -) -> Result<&'c [(String, u8)], Utf8Error> { +) -> StdResult<&'c [(String, u8)], Utf8Error> { match cache.entry((word.to_string(), is_prefix, max_typo)) { Entry::Occupied(entry) => Ok(entry.into_mut()), Entry::Vacant(entry) => { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 3c3420db4..c371b07d4 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -7,7 +7,7 @@ use meilisearch_tokenizer::TokenKind; use roaring::RoaringBitmap; use slice_group_by::GroupBy; -use crate::Index; +use crate::{Index, Result}; type IsOptionalWord = bool; type IsPrefix = bool; @@ -219,7 +219,7 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build(&self, query: TokenStream) -> anyhow::Result> { + pub fn build(&self, query: TokenStream) -> Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { @@ -291,14 +291,14 @@ fn create_query_tree( optional_words: bool, authorize_typos: bool, query: &[PrimitiveQueryPart], -) -> anyhow::Result +) -> Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, authorize_typos: bool, part: PrimitiveQueryPart, - ) -> anyhow::Result + ) -> Result { match part { // 1. try to split word in 2 @@ -325,7 +325,7 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> anyhow::Result + ) -> Result { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); @@ -379,7 +379,7 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: PrimitiveQuery, - ) -> anyhow::Result + ) -> Result { let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); let mut operation_children = Vec::new(); @@ -532,7 +532,7 @@ mod test { authorize_typos: bool, words_limit: Option, query: TokenStream, - ) -> anyhow::Result> + ) -> Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f4c13e8f8..6e26bf027 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,6 +1,7 @@ use chrono::Utc; use roaring::RoaringBitmap; -use crate::{ExternalDocumentsIds, Index, FieldsDistribution}; + +use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -18,7 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { ClearDocuments { wtxn, index, _update_id: update_id } } - pub fn execute(self) -> anyhow::Result { + pub fn execute(self) -> Result { self.index.set_updated_at(self.wtxn, &Utc::now())?; let Index { env: _env, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index c4cf132bb..7fc7e5d77 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,15 +1,16 @@ use std::collections::HashMap; use std::collections::hash_map::Entry; -use anyhow::{anyhow, Context}; use chrono::Utc; use fst::IntoStreamer; use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; +use crate::error::{InternalError, FieldIdMapMissingEntry, UserError}; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds}; +use crate::index::{db_name, main_key}; +use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -25,7 +26,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> anyhow::Result> + ) -> Result> { let external_documents_ids = index .external_documents_ids(wtxn)? @@ -54,7 +55,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Some(docid) } - pub fn execute(self) -> anyhow::Result { + pub fn execute(self) -> Result { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -77,8 +78,18 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let primary_key = self.index.primary_key(self.wtxn)?.context("missing primary key")?; - let id_field = fields_ids_map.id(primary_key).expect(r#"the field "id" to be present"#); + let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { + InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + } + })?; + let id_field = fields_ids_map.id(primary_key).ok_or_else(|| { + FieldIdMapMissingEntry::FieldName { + field_name: primary_key.to_string(), + process: "DeleteDocuments::execute", + } + })?; let Index { env: _env, @@ -119,7 +130,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), Value::Number(number) => SmallString32::from(number.to_string()), - _ => return Err(anyhow!("documents ids must be either strings or numbers")), + document_id => return Err(UserError::InvalidDocumentId { document_id }.into()), }; external_ids.push(external_id); } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index f0eab6023..757cbe810 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -9,11 +9,12 @@ use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use crate::error::InternalError; use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::{Index, Result}; pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -55,7 +56,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We get the faceted fields to be able to create the facet levels. let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; @@ -102,7 +103,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), content, - |_, _| anyhow::bail!("invalid facet number level merging"), + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }), WriteMethod::GetMergePut, )?; } @@ -132,7 +133,7 @@ fn compute_facet_number_levels<'t>( level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: u8, -) -> anyhow::Result> +) -> Result> { let first_level_size = db .remap_key_type::() @@ -195,7 +196,7 @@ fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, -) -> anyhow::Result +) -> Result { let mut documents_ids = RoaringBitmap::new(); @@ -214,7 +215,7 @@ fn write_number_entry( left: f64, right: f64, ids: &RoaringBitmap, -) -> anyhow::Result<()> +) -> Result<()> { let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 230116e99..3d9ffda6a 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -1,75 +1,43 @@ use std::borrow::Cow; +use std::result::Result as StdResult; -use anyhow::{bail, ensure, Context}; -use bstr::ByteSlice as _; use fst::IntoStreamer; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::Result; -const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); -const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes(); -const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); - -pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - match key { - WORDS_FST_KEY => { - let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect(); - - // Union of the FSTs - let mut op = fst::set::OpBuilder::new(); - fsts.iter().for_each(|fst| op.push(fst.into_stream())); - let op = op.r#union(); - - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - Ok(build.into_inner().unwrap()) - }, - FIELDS_IDS_MAP_KEY => { - ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match"); - Ok(values[0].to_vec()) - }, - DOCUMENTS_IDS_KEY => roaring_bitmap_merge(values), - otherwise => bail!("wut {:?}", otherwise), - } +/// Only the last value associated with an id is kept. +pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { + Ok(obkvs.last().unwrap().clone().into_owned()) } -pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - roaring_bitmap_merge(values) +/// Merge all the obks in the order we see them. +pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { + let mut iter = obkvs.iter(); + let first = iter.next().map(|b| b.clone().into_owned()).unwrap(); + Ok(iter.fold(first, |acc, current| { + let first = obkv::KvReader::new(&acc); + let second = obkv::KvReader::new(current); + let mut buffer = Vec::new(); + merge_two_obkvs(first, second, &mut buffer); + buffer + })) } -pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { - bail!("merging docid word positions is an error ({:?})", key.as_bstr()) +// Union of multiple FSTs +pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + let fsts = values.iter().map(fst::Set::new).collect::, _>>()?; + let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect(); + let op = op_builder.r#union(); + + let mut build = fst::SetBuilder::memory(); + build.extend_stream(op.into_stream()).unwrap(); + Ok(build.into_inner().unwrap()) } -pub fn field_id_docid_facet_values_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let first = values.first().context("no value to merge")?; - ensure!(values.iter().all(|v| v == first), "invalid field id docid facet value merging"); - Ok(first.to_vec()) -} - -pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { - bail!("merging documents is an error ({:?})", key.as_bstr()) +pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + Ok(values.first().unwrap().to_vec()) } pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec) { @@ -88,7 +56,7 @@ pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mu writer.finish().unwrap(); } -fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = RoaringBitmap::deserialize_from(&head[..])?; @@ -102,7 +70,7 @@ fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { Ok(vec) } -fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; @@ -112,6 +80,6 @@ fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { } let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec)?; + CboRoaringBitmapCodec::serialize_into(&head, &mut vec); Ok(vec) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4cf56b563..51c8b948a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -3,11 +3,11 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; use std::num::{NonZeroU32, NonZeroUsize}; +use std::result::Result as StdResult; use std::str; use std::sync::mpsc::sync_channel; use std::time::Instant; -use anyhow::Context; use bstr::ByteSlice as _; use chrono::Utc; use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; @@ -18,18 +18,15 @@ use rayon::prelude::*; use rayon::ThreadPool; use serde::{Serialize, Deserialize}; -use crate::index::Index; +use crate::error::{Error, InternalError}; +use crate::{Index, Result}; use crate::update::{ Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, WordPrefixPairProximityDocids, }; use self::store::{Store, Readers}; pub use self::merge_function::{ - main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, - word_level_position_docids_merge, word_prefix_level_positions_docids_merge, - facet_field_value_docids_merge, field_id_docid_facet_values_merge, - field_id_word_count_docids_merge, + fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first }; pub use self::transform::{Transform, TransformOutput}; @@ -60,14 +57,14 @@ pub fn create_writer(typ: CompressionType, level: Option, file: File) -> io builder.build(file) } -pub fn create_sorter( - merge: MergeFn, +pub fn create_sorter( + merge: MergeFn, chunk_compression_type: CompressionType, chunk_compression_level: Option, chunk_fusing_shrink_size: Option, max_nb_chunks: Option, max_memory: Option, -) -> Sorter +) -> Sorter> { let mut builder = Sorter::builder(merge); if let Some(shrink_size) = chunk_fusing_shrink_size { @@ -86,7 +83,7 @@ pub fn create_sorter( builder.build() } -pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> anyhow::Result> { +pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> Result> { let mut file = writer.into_inner()?; file.seek(SeekFrom::Start(0))?; let file = if let Some(shrink_size) = shrink_size { @@ -97,19 +94,25 @@ pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> any Reader::new(file).map_err(Into::into) } -pub fn merge_readers(sources: Vec>, merge: MergeFn) -> Merger { +pub fn merge_readers( + sources: Vec>, + merge: MergeFn, +) -> Merger> +{ let mut builder = Merger::builder(merge); builder.extend(sources); builder.build() } -pub fn merge_into_lmdb_database( +pub fn merge_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, sources: Vec>, - merge: MergeFn, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, { debug!("Merging {} MTBL stores...", sources.len()); let before = Instant::now(); @@ -127,13 +130,15 @@ pub fn merge_into_lmdb_database( Ok(()) } -pub fn write_into_lmdb_database( +pub fn write_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, mut reader: Reader, - merge: MergeFn, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, { debug!("Writing MTBL stores..."); let before = Instant::now(); @@ -142,9 +147,7 @@ pub fn write_into_lmdb_database( WriteMethod::Append => { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; while let Some((k, v)) = reader.next()? { - out_iter.append(k, v).with_context(|| { - format!("writing {:?} into LMDB", k.as_bstr()) - })?; + out_iter.append(k, v)?; } }, WriteMethod::GetMergePut => { @@ -152,7 +155,7 @@ pub fn write_into_lmdb_database( let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; match iter.next().transpose()? { Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; let val = merge(k, &vals)?; iter.put_current(k, &val)?; }, @@ -169,13 +172,16 @@ pub fn write_into_lmdb_database( Ok(()) } -pub fn sorter_into_lmdb_database( +pub fn sorter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, - sorter: Sorter, - merge: MergeFn, + sorter: Sorter>, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, + Error: From> { debug!("Writing MTBL sorter..."); let before = Instant::now(); @@ -192,21 +198,21 @@ pub fn sorter_into_lmdb_database( Ok(()) } -fn merger_iter_into_lmdb_database( +fn merger_iter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, - mut sorter: MergerIter, - merge: MergeFn, + mut sorter: MergerIter>, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, { match method { WriteMethod::Append => { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; while let Some((k, v)) = sorter.next()? { - out_iter.append(k, v).with_context(|| { - format!("writing {:?} into LMDB", k.as_bstr()) - })?; + out_iter.append(k, v)?; } }, WriteMethod::GetMergePut => { @@ -215,7 +221,10 @@ fn merger_iter_into_lmdb_database( match iter.next().transpose()? { Some((key, old_val)) if key == k => { let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).expect("merge failed"); + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? + InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; iter.put_current(k, &val)?; }, _ => { @@ -322,7 +331,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.autogenerate_docids = false; } - pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn execute(self, reader: R, progress_callback: F) -> Result where R: io::Read, F: Fn(UpdateIndexingStep, u64) + Sync, @@ -369,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Ok(DocumentAdditionResult { nb_documents }) } - pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()> + pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync { @@ -407,15 +416,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("{} documents actually deleted", deleted_documents_count); } - let mmap; - let bytes = if documents_count == 0 { - &[][..] - } else { - mmap = unsafe { Mmap::map(&documents_file).context("mmaping the transform documents file")? }; - &mmap - }; + if documents_count == 0 { + return Ok(()); + } - let documents = grenad::Reader::new(bytes).unwrap(); + let bytes = unsafe { Mmap::map(&documents_file)? }; + let documents = grenad::Reader::new(bytes.as_bytes()).unwrap(); // The enum which indicates the type of the readers // merges that are potentially done on different threads. @@ -481,7 +487,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { &progress_callback, ) }) - .collect::, _>>()?; + .collect::, _>>()?; let mut main_readers = Vec::with_capacity(readers.len()); let mut word_docids_readers = Vec::with_capacity(readers.len()); @@ -539,22 +545,22 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("Merging the main, word docids and words pairs proximity docids in parallel..."); rayon::spawn(move || { vec![ - (DatabaseType::Main, main_readers, main_merge as MergeFn), - (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), + (DatabaseType::Main, main_readers, fst_merge as MergeFn<_>), + (DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge), ( DatabaseType::FacetLevel0NumbersDocids, facet_field_numbers_docids_readers, - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, ), ( DatabaseType::WordLevel0PositionDocids, word_level_position_docids_readers, - word_level_position_docids_merge, + cbo_roaring_bitmap_merge, ), ( DatabaseType::FieldIdWordCountDocids, field_id_word_count_docids_readers, - field_id_word_count_docids_merge, + cbo_roaring_bitmap_merge, ), ] .into_par_iter() @@ -574,7 +580,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, - )) as anyhow::Result<_> + )) as Result<_> })?; let ( @@ -622,12 +628,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); - debug!("Writing the docid word positions into LMDB on disk..."); + debug!("Inserting the docid word positions into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, *self.index.docid_word_positions.as_polymorph(), docid_word_positions_readers, - docid_word_positions_merge, + keep_first, write_method )?; @@ -637,12 +643,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); - debug!("Writing the documents into LMDB on disk..."); + debug!("Inserting the documents into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, *self.index.documents.as_polymorph(), documents_readers, - documents_merge, + keep_first, write_method )?; @@ -657,7 +663,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_field_strings_docids_readers, - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; @@ -672,7 +678,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.field_id_docid_facet_f64s.as_polymorph(), field_id_docid_facet_numbers_readers, - field_id_docid_facet_values_merge, + keep_first, write_method, )?; @@ -687,7 +693,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.field_id_docid_facet_strings.as_polymorph(), field_id_docid_facet_strings_readers, - field_id_docid_facet_values_merge, + keep_first, write_method, )?; @@ -702,7 +708,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.word_pair_proximity_docids.as_polymorph(), words_pairs_proximities_docids_readers, - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; @@ -721,7 +727,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, self.index.main, content, - main_merge, + fst_merge, WriteMethod::GetMergePut, )?; }, @@ -732,7 +738,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - word_docids_merge, + roaring_bitmap_merge, write_method, )?; }, @@ -743,7 +749,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; }, @@ -754,7 +760,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - field_id_word_count_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; }, @@ -765,7 +771,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - word_level_position_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 08050092e..94ae12108 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -6,7 +6,6 @@ use std::iter::FromIterator; use std::time::Instant; use std::{cmp, iter}; -use anyhow::Context; use bstr::ByteSlice as _; use fst::Set; use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; @@ -19,24 +18,21 @@ use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; +use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; -use super::merge_function::{ - main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - word_level_position_docids_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, field_id_word_count_docids_merge, -}; +use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; const MAX_POSITION: usize = 1000; -const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); +const WORDS_FST_KEY: &[u8] = crate::index::main_key::WORDS_FST_KEY.as_bytes(); pub struct Readers { pub main: Reader, @@ -70,15 +66,15 @@ pub struct Store<'s, A> { chunk_compression_level: Option, chunk_fusing_shrink_size: Option, // MTBL sorters - main_sorter: Sorter, - word_docids_sorter: Sorter, - words_pairs_proximities_docids_sorter: Sorter, - word_level_position_docids_sorter: Sorter, - field_id_word_count_docids_sorter: Sorter, - facet_field_numbers_docids_sorter: Sorter, - facet_field_strings_docids_sorter: Sorter, - field_id_docid_facet_numbers_sorter: Sorter, - field_id_docid_facet_strings_sorter: Sorter, + main_sorter: Sorter>, + word_docids_sorter: Sorter>, + words_pairs_proximities_docids_sorter: Sorter>, + word_level_position_docids_sorter: Sorter>, + field_id_word_count_docids_sorter: Sorter>, + facet_field_numbers_docids_sorter: Sorter>, + facet_field_strings_docids_sorter: Sorter>, + field_id_docid_facet_numbers_sorter: Sorter>, + field_id_docid_facet_strings_sorter: Sorter>, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -97,14 +93,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { chunk_compression_level: Option, chunk_fusing_shrink_size: Option, stop_words: Option<&'s Set>, - ) -> anyhow::Result + ) -> Result { // We divide the max memory by the number of sorter the Store have. let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( - main_merge, + fst_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -112,7 +108,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let word_docids_sorter = create_sorter( - word_docids_merge, + roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -120,7 +116,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let words_pairs_proximities_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -128,7 +124,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let word_level_position_docids_sorter = create_sorter( - word_level_position_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -136,7 +132,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let field_id_word_count_docids_sorter = create_sorter( - field_id_word_count_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -144,7 +140,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_numbers_docids_sorter = create_sorter( - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -152,7 +148,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_strings_docids_sorter = create_sorter( - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -160,7 +156,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let field_id_docid_facet_numbers_sorter = create_sorter( - field_id_docid_facet_values_merge, + keep_first, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -168,7 +164,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Some(1024 * 1024 * 1024), // 1MB ); let field_id_docid_facet_strings_sorter = create_sorter( - field_id_docid_facet_values_merge, + keep_first, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -225,7 +221,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } // Save the documents ids under the position and word we have seen it. - fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> { + fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.word_docids.get_refresh(word.as_bytes()) { Some(old) => { old.insert(id); }, @@ -250,7 +246,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: OrderedFloat, id: DocumentId, - ) -> anyhow::Result<()> + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_numbers_sorter; Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; @@ -283,7 +279,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: String, id: DocumentId, - ) -> anyhow::Result<()> + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_strings_sorter; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; @@ -315,7 +311,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { &mut self, words_pairs_proximities: impl IntoIterator, id: DocumentId, - ) -> anyhow::Result<()> + ) -> Result<()> { for ((w1, w2), prox) in words_pairs_proximities { let w1 = SmallVec32::from(w1.as_bytes()); @@ -354,7 +350,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { facet_numbers_values: &mut HashMap>, facet_strings_values: &mut HashMap>, record: &[u8], - ) -> anyhow::Result<()> + ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. let words_pair_proximities = compute_words_pair_proximities(&words_positions); @@ -389,10 +385,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_words_pairs_proximities( - sorter: &mut Sorter, + fn write_words_pairs_proximities( + sorter: &mut Sorter>, iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let mut key = Vec::new(); let mut buffer = Vec::new(); @@ -407,7 +405,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // We serialize the document ids into a buffer buffer.clear(); buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids)); - CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer)?; + CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer); // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { sorter.insert(&key, &buffer)?; @@ -421,10 +419,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { writer: &mut Writer, id: DocumentId, words_positions: &HashMap>, - ) -> anyhow::Result<()> + ) -> Result<()> { // We prefix the words by the document id. let mut key = id.to_be_bytes().to_vec(); + let mut buffer = Vec::new(); let base_size = key.len(); // We order the words lexicographically, this way we avoid passing by a sorter. @@ -433,24 +432,28 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for (word, positions) in words_positions { key.truncate(base_size); key.extend_from_slice(word.as_bytes()); + buffer.clear(); + // We serialize the positions into a buffer. let positions = RoaringBitmap::from_iter(positions.iter().cloned()); - let bytes = BoRoaringBitmapCodec::bytes_encode(&positions) - .with_context(|| "could not serialize positions")?; + BoRoaringBitmapCodec::serialize_into(&positions, &mut buffer); + // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { - writer.insert(&key, &bytes)?; + writer.insert(&key, &buffer)?; } } Ok(()) } - fn write_word_position_docids( - writer: &mut Sorter, + fn write_word_position_docids( + writer: &mut Sorter>, document_id: DocumentId, words_positions: &HashMap>, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let mut key_buffer = Vec::new(); let mut data_buffer = Vec::new(); @@ -469,8 +472,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { data_buffer.clear(); let positions = RoaringBitmap::from_iter(Some(document_id)); // We serialize the positions into a buffer. - CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) - .with_context(|| "could not serialize positions")?; + CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer); // that we write under the generated key into MTBL if lmdb_key_valid_size(&key_buffer) { @@ -482,56 +484,71 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_string_docids( - sorter: &mut Sorter, + fn write_facet_field_string_docids( + sorter: &mut Sorter>, iter: I, - ) -> anyhow::Result<()> - where I: IntoIterator + ) -> Result<()> + where + I: IntoIterator, + Error: From, { + let mut key_buffer = Vec::new(); + let mut data_buffer = Vec::new(); + for ((field_id, value), docids) in iter { - let key = FacetValueStringCodec::bytes_encode(&(field_id, &value)) - .map(Cow::into_owned) - .context("could not serialize facet key")?; - let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) - .context("could not serialize docids")?; - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &bytes)?; + key_buffer.clear(); + data_buffer.clear(); + + FacetValueStringCodec::serialize_into(field_id, &value, &mut key_buffer); + CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + + if lmdb_key_valid_size(&key_buffer) { + sorter.insert(&key_buffer, &data_buffer)?; } } Ok(()) } - fn write_facet_field_number_docids( - sorter: &mut Sorter, + fn write_facet_field_number_docids( + sorter: &mut Sorter>, iter: I, - ) -> anyhow::Result<()> - where I: IntoIterator), RoaringBitmap)> + ) -> Result<()> + where + I: IntoIterator), RoaringBitmap)>, + Error: From, { + let mut data_buffer = Vec::new(); + for ((field_id, value), docids) in iter { + data_buffer.clear(); + let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) .map(Cow::into_owned) - .context("could not serialize facet key")?; - let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) - .context("could not serialize docids")?; + .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; + + CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + if lmdb_key_valid_size(&key) { - sorter.insert(&key, &bytes)?; + sorter.insert(&key, &data_buffer)?; } } Ok(()) } - fn write_field_id_docid_facet_number_value( - sorter: &mut Sorter, + fn write_field_id_docid_facet_number_value( + sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, value: OrderedFloat, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) .map(Cow::into_owned) - .context("could not serialize facet key")?; + .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; if lmdb_key_valid_size(&key) { sorter.insert(&key, &[])?; @@ -540,26 +557,30 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_field_id_docid_facet_string_value( - sorter: &mut Sorter, + fn write_field_id_docid_facet_string_value( + sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, value: &str, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { - let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value)) - .map(Cow::into_owned) - .context("could not serialize facet key")?; + let mut buffer = Vec::new(); - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &[])?; + FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); + + if lmdb_key_valid_size(&buffer) { + sorter.insert(&buffer, &[])?; } Ok(()) } - fn write_word_docids(sorter: &mut Sorter, iter: I) -> anyhow::Result<()> - where I: IntoIterator, RoaringBitmap)> + fn write_word_docids(sorter: &mut Sorter>, iter: I) -> Result<()> + where + I: IntoIterator, RoaringBitmap)>, + Error: From, { let mut key = Vec::new(); let mut buffer = Vec::new(); @@ -589,7 +610,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { num_threads: usize, log_every_n: Option, mut progress_callback: F, - ) -> anyhow::Result + ) -> Result where F: FnMut(UpdateIndexingStep), { debug!("{:?}: Indexing in a Store...", thread_index); @@ -618,7 +639,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for (attr, content) in document.iter() { if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { - let value = serde_json::from_slice(content)?; + let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; let (facet_numbers, facet_strings) = extract_facet_values(&value); facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); @@ -672,7 +693,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(readers) } - fn finish(mut self) -> anyhow::Result { + fn finish(mut self) -> Result { let comp_type = self.chunk_compression_type; let comp_level = self.chunk_compression_level; let shrink_size = self.chunk_fusing_shrink_size; @@ -706,7 +727,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut docids_buffer = Vec::new(); for ((fid, count), docids) in self.field_id_word_count_docids { docids_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?; + CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index fd508d6a4..9e88559d0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -2,16 +2,20 @@ use std::borrow::Cow; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; +use std::result::Result as StdResult; use std::time::Instant; -use anyhow::{anyhow, Context}; use grenad::CompressionType; use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; +use crate::error::{Error, UserError, InternalError}; +use crate::index::db_name; +use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; +use crate::{Index, Result}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; @@ -52,7 +56,7 @@ fn is_primary_key(field: impl AsRef) -> bool { } impl Transform<'_, '_> { - pub fn output_from_json(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn output_from_json(self, reader: R, progress_callback: F) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -60,7 +64,7 @@ impl Transform<'_, '_> { self.output_from_generic_json(reader, false, progress_callback) } - pub fn output_from_json_stream(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn output_from_json_stream(self, reader: R, progress_callback: F) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -73,7 +77,7 @@ impl Transform<'_, '_> { reader: R, is_stream: bool, progress_callback: F, - ) -> anyhow::Result + ) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -87,7 +91,7 @@ impl Transform<'_, '_> { let iter = Box::new(iter) as Box>; iter.peekable() } else { - let vec: Vec<_> = serde_json::from_reader(reader)?; + let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?; let iter = vec.into_iter().map(Ok); let iter = Box::new(iter) as Box>; iter.peekable() @@ -95,9 +99,12 @@ impl Transform<'_, '_> { // We extract the primary key from the first document in // the batch if it hasn't already been defined in the index - let first = match documents.peek().map(Result::as_ref).transpose() { + let first = match documents.peek().map(StdResult::as_ref).transpose() { Ok(first) => first, - Err(_) => return Err(documents.next().unwrap().unwrap_err().into()), + Err(_) => { + let error = documents.next().unwrap().unwrap_err(); + return Err(UserError::SerdeJson(error).into()); + }, }; let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); @@ -144,7 +151,7 @@ impl Transform<'_, '_> { let mut documents_count = 0; for result in documents { - let document = result?; + let document = result.map_err(UserError::SerdeJson)?; if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { @@ -157,7 +164,7 @@ impl Transform<'_, '_> { // We prepare the fields ids map with the documents keys. for (key, _value) in &document { - fields_ids_map.insert(&key).context("field id limit reached")?; + fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; } // We retrieve the user id from the document based on the primary key name, @@ -166,11 +173,13 @@ impl Transform<'_, '_> { Some(value) => match value { Value::String(string) => Cow::Borrowed(string.as_str()), Value::Number(number) => Cow::Owned(number.to_string()), - _ => return Err(anyhow!("documents ids must be either strings or numbers")), + content => return Err(UserError::InvalidDocumentId { + document_id: content.clone(), + }.into()), }, None => { if !self.autogenerate_docids { - return Err(anyhow!("missing primary key")); + return Err(UserError::MissingPrimaryKey.into()); } let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); Cow::Borrowed(uuid) @@ -185,13 +194,15 @@ impl Transform<'_, '_> { // and this should be the document id we return the one we generated. if let Some(value) = document.get(name) { // We serialize the attribute values. - serde_json::to_writer(&mut json_buffer, value)?; + serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?; writer.insert(field_id, &json_buffer)?; } // We validate the document id [a-zA-Z0-9\-_]. if field_id == primary_key_id && validate_document_id(&external_id).is_none() { - return Err(anyhow!("invalid document id: {:?}", external_id)); + return Err(UserError::InvalidDocumentId { + document_id: Value::from(external_id), + }.into()); } } @@ -216,7 +227,7 @@ impl Transform<'_, '_> { ) } - pub fn output_from_csv(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn output_from_csv(self, reader: R, progress_callback: F) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -225,12 +236,12 @@ impl Transform<'_, '_> { let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let mut csv = csv::Reader::from_reader(reader); - let headers = csv.headers()?; + let headers = csv.headers().map_err(UserError::Csv)?; let mut fields_ids = Vec::new(); // Generate the new fields ids based on the current fields ids and this CSV headers. for (i, header) in headers.iter().enumerate() { - let id = fields_ids_map.insert(header).context("field id limit reached)")?; + let id = fields_ids_map.insert(header).ok_or(UserError::AttributeLimitReached)?; fields_ids.push((id, i)); } @@ -246,7 +257,7 @@ impl Transform<'_, '_> { // Returns the field id in the fields ids map, create an "id" field // in case it is not in the current headers. let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string()); - let (primary_key_id, _) = compute_primary_key_pair( + let (primary_key_id, primary_key_name) = compute_primary_key_pair( self.index.primary_key(self.rtxn)?, &mut fields_ids_map, alternative_name, @@ -280,7 +291,7 @@ impl Transform<'_, '_> { let mut documents_count = 0; let mut record = csv::StringRecord::new(); - while csv.read_record(&mut record)? { + while csv.read_record(&mut record).map_err(UserError::Csv)? { obkv_buffer.clear(); let mut writer = obkv::KvWriter::new(&mut obkv_buffer); @@ -297,7 +308,9 @@ impl Transform<'_, '_> { // We validate the document id [a-zA-Z0-9\-_]. match validate_document_id(&external_id) { Some(valid) => valid, - None => return Err(anyhow!("invalid document id: {:?}", external_id)), + None => return Err(UserError::InvalidDocumentId { + document_id: Value::from(external_id), + }.into()), } }, None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), @@ -315,7 +328,7 @@ impl Transform<'_, '_> { for (field_id, field) in iter { // We serialize the attribute values as JSON strings. json_buffer.clear(); - serde_json::to_writer(&mut json_buffer, &field)?; + serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?; writer.insert(*field_id, &json_buffer)?; } @@ -330,10 +343,6 @@ impl Transform<'_, '_> { // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. - let primary_key_name = fields_ids_map - .name(primary_key_id) - .map(String::from) - .expect("Primary key must be present in fields id map"); self.output_from_sorter( sorter, primary_key_name, @@ -347,17 +356,18 @@ impl Transform<'_, '_> { /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. - fn output_from_sorter( + fn output_from_sorter( self, - sorter: grenad::Sorter, + sorter: grenad::Sorter>, primary_key: String, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, - ) -> anyhow::Result + ) -> Result where F: Fn(UpdateIndexingStep) + Sync, + Error: From, { let documents_ids = self.index.documents_ids(self.rtxn)?; let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; @@ -365,7 +375,7 @@ impl Transform<'_, '_> { // Once we have sort and deduplicated the documents we write them into a final file. let mut final_sorter = create_sorter( - |_docid, _obkvs| Err(anyhow!("cannot merge two documents")), + |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }), self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -401,7 +411,10 @@ impl Transform<'_, '_> { IndexDocumentsMethod::UpdateDocuments => { let key = BEU32::new(docid); let base_obkv = self.index.documents.get(&self.rtxn, &key)? - .context("document not found")?; + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice()) @@ -412,7 +425,7 @@ impl Transform<'_, '_> { // If this user id is new we add it to the external documents ids map // for new ids and into the list of new documents. let new_docid = available_documents_ids.next() - .context("no more available documents ids")?; + .ok_or(UserError::DocumentLimitReached)?; new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; new_documents_ids.insert(new_docid); (new_docid, update_obkv) @@ -472,7 +485,7 @@ impl Transform<'_, '_> { primary_key: String, old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, - ) -> anyhow::Result + ) -> Result { let fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; @@ -532,10 +545,10 @@ fn compute_primary_key_pair( fields_ids_map: &mut FieldsIdsMap, alternative_name: Option, autogenerate_docids: bool, -) -> anyhow::Result<(FieldId, String)> { +) -> Result<(FieldId, String)> { match primary_key { Some(primary_key) => { - let id = fields_ids_map.insert(primary_key).ok_or(anyhow!("Maximum number of fields exceeded"))?; + let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; Ok((id, primary_key.to_string())) } None => { @@ -545,35 +558,17 @@ fn compute_primary_key_pair( if !autogenerate_docids { // If there is no primary key in the current document batch, we must // return an error and not automatically generate any document id. - anyhow::bail!("missing primary key") + return Err(UserError::MissingPrimaryKey.into()); } DEFAULT_PRIMARY_KEY_NAME.to_string() }, }; - let id = fields_ids_map.insert(&name).context("field id limit reached")?; + let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; Ok((id, name)) }, } } -/// Only the last value associated with an id is kept. -fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result> { - obkvs.last().context("no last value").map(|last| last.clone().into_owned()) -} - -/// Merge all the obks in the order we see them. -fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result> { - let mut iter = obkvs.iter(); - let first = iter.next().map(|b| b.clone().into_owned()).context("no first value")?; - Ok(iter.fold(first, |acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(current); - let mut buffer = Vec::new(); - merge_two_obkvs(first, second, &mut buffer); - buffer - })) -} - fn validate_document_id(document_id: &str) -> Option<&str> { let document_id = document_id.trim(); Some(document_id).filter(|id| { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1c687e089..8f4fe48c9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeSet, HashMap, HashSet}; +use std::result::Result as StdResult; -use anyhow::Context; use chrono::Utc; use grenad::CompressionType; use itertools::Itertools; @@ -9,9 +9,10 @@ use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::criterion::Criterion; +use crate::error::UserError; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; -use crate::{FieldsIdsMap, Index}; +use crate::{FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] pub enum Setting { @@ -33,7 +34,7 @@ impl Setting { } impl Serialize for Setting { - fn serialize(&self, serializer: S) -> Result where S: Serializer { + fn serialize(&self, serializer: S) -> StdResult where S: Serializer { match self { Self::Set(value) => Some(value), // Usually not_set isn't serialized by setting skip_serializing_if field attribute @@ -43,7 +44,7 @@ impl Serialize for Setting { } impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting { - fn deserialize(deserializer: D) -> Result where D: Deserializer<'de> { + fn deserialize(deserializer: D) -> StdResult where D: Deserializer<'de> { Deserialize::deserialize(deserializer).map(|x| match x { Some(x) => Self::Set(x), None => Self::Reset, // Reset is forced by sending null value @@ -71,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> { stop_words: Setting>, distinct_field: Setting, synonyms: Setting>>, + primary_key: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -97,6 +99,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { stop_words: Setting::NotSet, distinct_field: Setting::NotSet, synonyms: Setting::NotSet, + primary_key: Setting::NotSet, update_id, } } @@ -165,7 +168,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> + pub fn reset_primary_key(&mut self) { + self.primary_key = Setting::Reset; + } + + pub fn set_primary_key(&mut self, primary_key: String) { + self.primary_key = Setting::Set(primary_key); + } + + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync { @@ -192,7 +203,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // There already has been a document addition, the primary key should be set by now. - let primary_key = self.index.primary_key(&self.wtxn)?.context("Index must have a primary key")?; + let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; // We remap the documents fields based on the new `FieldsIdsMap`. let output = transform.remap_index_documents( @@ -220,7 +231,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - fn update_displayed(&mut self) -> anyhow::Result { + fn update_displayed(&mut self) -> Result { match self.displayed_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -234,7 +245,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for name in names.iter() { fields_ids_map .insert(name) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; } self.index.put_displayed_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -245,13 +256,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - fn update_distinct_field(&mut self) -> anyhow::Result { + fn update_distinct_field(&mut self) -> Result { match self.distinct_field { Setting::Set(ref attr) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; fields_ids_map .insert(attr) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; self.index.put_distinct_field(self.wtxn, &attr)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -264,7 +275,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { /// Updates the index's searchable attributes. This causes the field map to be recomputed to /// reflect the order of the searchable attributes. - fn update_searchable(&mut self) -> anyhow::Result { + fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the @@ -285,13 +296,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for name in names.iter() { new_fields_ids_map .insert(&name) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; } for (_, name) in old_fields_ids_map.iter() { new_fields_ids_map .insert(&name) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; } self.index.put_searchable_fields(self.wtxn, &names)?; @@ -303,7 +314,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - fn update_stop_words(&mut self) -> anyhow::Result { + fn update_stop_words(&mut self) -> Result { match self.stop_words { Setting::Set(ref stop_words) => { let current = self.index.stop_words(self.wtxn)?; @@ -325,7 +336,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn update_synonyms(&mut self) -> anyhow::Result { + fn update_synonyms(&mut self) -> Result { match self.synonyms { Setting::Set(ref synonyms) => { fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec { @@ -383,13 +394,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn update_filterable(&mut self) -> anyhow::Result<()> { + fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_facets = HashSet::new(); for name in fields { - fields_ids_map.insert(name).context("field id limit exceeded")?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; new_facets.insert(name.clone()); } self.index.put_filterable_fields(self.wtxn, &new_facets)?; @@ -401,7 +412,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - fn update_criteria(&mut self) -> anyhow::Result<()> { + fn update_criteria(&mut self) -> Result<()> { match self.criteria { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -409,7 +420,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for name in fields { let criterion: Criterion = name.parse()?; if let Some(name) = criterion.field_name() { - fields_ids_map.insert(name).context("field id limit exceeded")?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } new_criteria.push(criterion); } @@ -422,7 +433,32 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - pub fn execute(mut self, progress_callback: F) -> anyhow::Result<()> + fn update_primary_key(&mut self) -> Result<()> { + match self.primary_key { + Setting::Set(ref primary_key) => { + if self.index.number_of_documents(&self.wtxn)? == 0 { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_primary_key(self.wtxn, primary_key)?; + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeChanged.into()) + } + }, + Setting::Reset => { + if self.index.number_of_documents(&self.wtxn)? == 0 { + self.index.delete_primary_key(self.wtxn)?; + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeReset.into()) + } + }, + Setting::NotSet => Ok(()), + } + } + + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync { @@ -435,6 +471,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_filterable()?; self.update_distinct_field()?; self.update_criteria()?; + self.update_primary_key()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, @@ -461,8 +498,9 @@ mod tests { use maplit::{btreeset, hashmap, hashset}; use big_s::S; - use crate::{Criterion, FilterCondition, SearchResult}; + use crate::error::Error; use crate::update::{IndexDocuments, UpdateFormat}; + use crate::{Criterion, FilterCondition, SearchResult}; use super::*; @@ -976,4 +1014,56 @@ mod tests { let rtxn = index.read_txn().unwrap(); FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err(); } + + #[test] + fn setting_primary_key() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the primary key settings + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("mykey")); + + builder.execute(|_, _| ()).unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + + // Then index some documents with the "mykey" primary key. + let content = &br#"[ + { "mykey": 1, "name": "kevin", "age": 23 }, + { "mykey": 2, "name": "kevina", "age": 21 }, + { "mykey": 3, "name": "benoit", "age": 34 }, + { "mykey": 4, "name": "bernard", "age": 34 }, + { "mykey": 5, "name": "bertrand", "age": 34 }, + { "mykey": 6, "name": "bernie", "age": 34 }, + { "mykey": 7, "name": "ben", "age": 34 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.disable_autogenerate_docids(); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // We now try to reset the primary key + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.reset_primary_key(); + + let err = builder.execute(|_, _| ()).unwrap_err(); + assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + wtxn.abort().unwrap(); + + // But if we clear the database... + let mut wtxn = index.write_txn().unwrap(); + let builder = ClearDocuments::new(&mut wtxn, &index, 0); + builder.execute().unwrap(); + + // ...we can change the primary key + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("myid")); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + } } diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 8d6eb034d..1d0e776b1 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -1,7 +1,7 @@ use grenad::CompressionType; use rayon::ThreadPool; -use crate::Index; +use crate::{Index, Result}; use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { @@ -76,7 +76,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> anyhow::Result> + ) -> Result> { DeleteDocuments::new(wtxn, index, self.update_id) } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 58c984212..a2197b28c 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,8 +5,11 @@ use fst::Streamer; use grenad::CompressionType; use heed::types::ByteSlice; +use crate::Result; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; +use crate::update::index_documents::{ + create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, +}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -31,7 +34,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { // Clear the word prefix docids database. self.index.word_prefix_docids.clear(self.wtxn)?; @@ -40,7 +43,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( - word_docids_merge, + roaring_bitmap_merge, self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -66,7 +69,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_docids.as_polymorph(), prefix_docids_sorter, - word_docids_merge, + roaring_bitmap_merge, WriteMethod::Append, )?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index c972efc4f..9019b26e5 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -7,11 +7,11 @@ use heed::BytesEncode; use heed::types::ByteSlice; use log::debug; -use crate::Index; +use crate::{Index, Result}; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ WriteMethod, create_sorter, sorter_into_lmdb_database, - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, }; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; @@ -50,7 +50,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -80,7 +80,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), word_prefix_pair_proximity_docids_sorter, - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, WriteMethod::Append, )?; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 1b772c37d..e2e3f7b4c 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -11,11 +11,13 @@ use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use crate::error::InternalError; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; +use crate::Result; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{ create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, - word_prefix_level_positions_docids_merge, sorter_into_lmdb_database + cbo_roaring_bitmap_merge, sorter_into_lmdb_database }; use crate::{Index, TreeLevel}; @@ -56,7 +58,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); let entries = compute_positions_levels( @@ -78,7 +80,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid word level position merging"), + |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" }), WriteMethod::Append, )?; @@ -86,7 +88,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_prefix_level_position_docids.clear(self.wtxn)?; let mut word_prefix_level_positions_docids_sorter = create_sorter( - word_prefix_level_positions_docids_merge, + cbo_roaring_bitmap_merge, self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -119,7 +121,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), word_prefix_level_positions_docids_sorter, - word_prefix_level_positions_docids_merge, + cbo_roaring_bitmap_merge, WriteMethod::Append, )?; @@ -142,7 +144,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid word prefix level position merging"), + |_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }), WriteMethod::Append, )?; @@ -174,7 +176,7 @@ fn compute_positions_levels( shrink_size: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, -) -> anyhow::Result> +) -> Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. @@ -251,7 +253,7 @@ fn write_level_entry( left: u32, right: u32, ids: &RoaringBitmap, -) -> anyhow::Result<()> +) -> Result<()> { let key = (word, level, left, right); let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index f53b0ee00..d1aa267b8 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -2,7 +2,7 @@ use std::iter::FromIterator; use std::str; use fst::Streamer; -use crate::{Index, SmallString32}; +use crate::{Index, SmallString32, Result}; pub struct WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -48,7 +48,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { self } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; let number_of_words = words_fst.len(); let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f0eecfaba..2b9c5ae5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,5 +1,6 @@ -use milli::{Search, SearchResult, Criterion}; use big_s::S; +use milli::update::Settings; +use milli::{Search, SearchResult, Criterion}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use Criterion::*; @@ -189,7 +190,9 @@ fn criteria_mixup() { eprintln!("Testing with criteria order: {:?}", &criteria); //update criteria let mut wtxn = index.write_txn().unwrap(); - index.put_criteria(&mut wtxn, &criteria).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap();