From b27f632e1453b6ebc95dde1945759c9b4bc9a029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 28 Dec 2018 16:15:22 +0100 Subject: [PATCH] feat: Make the Attribute and WordArea errors recoverable --- src/data/doc_indexes.rs | 36 ++++++++++-- src/database/blob/positive/blob.rs | 36 ++++++++++-- src/database/update/positive/update.rs | 4 +- src/lib.rs | 81 +++++++++++++++++++++----- src/rank/criterion/sum_of_typos.rs | 80 +++++++++++++++++++++---- src/rank/criterion/words_proximity.rs | 22 +++---- 6 files changed, 208 insertions(+), 51 deletions(-) diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index ee4ec9d0a..ce466a85a 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -164,9 +164,21 @@ mod tests { #[test] fn builder_serialize_deserialize() -> Result<(), Box> { - let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; - let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; - let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; + let a = DocIndex { + document_id: DocumentId(0), + attribute: Attribute::new_faillible(3, 11), + word_area: WordArea::new_faillible(30, 4) + }; + let b = DocIndex { + document_id: DocumentId(1), + attribute: Attribute::new_faillible(4, 21), + word_area: WordArea::new_faillible(35, 6) + }; + let c = DocIndex { + document_id: DocumentId(2), + attribute: Attribute::new_faillible(8, 2), + word_area: WordArea::new_faillible(89, 6) + }; let mut builder = DocIndexesBuilder::memory(); @@ -187,9 +199,21 @@ mod tests { #[test] fn serialize_deserialize() -> Result<(), Box> { - let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; - let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; - 
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; + let a = DocIndex { + document_id: DocumentId(0), + attribute: Attribute::new_faillible(3, 11), + word_area: WordArea::new_faillible(30, 4) + }; + let b = DocIndex { + document_id: DocumentId(1), + attribute: Attribute::new_faillible(4, 21), + word_area: WordArea::new_faillible(35, 6) + }; + let c = DocIndex { + document_id: DocumentId(2), + attribute: Attribute::new_faillible(8, 2), + word_area: WordArea::new_faillible(89, 6) + }; let mut builder = DocIndexesBuilder::memory(); diff --git a/src/database/blob/positive/blob.rs b/src/database/blob/positive/blob.rs index bd1f32d6f..df2e8497a 100644 --- a/src/database/blob/positive/blob.rs +++ b/src/database/blob/positive/blob.rs @@ -209,9 +209,21 @@ mod tests { #[test] fn serialize_deserialize() -> Result<(), Box> { - let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; - let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; - let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; + let a = DocIndex { + document_id: DocumentId(0), + attribute: Attribute::new_faillible(3, 11), + word_area: WordArea::new_faillible(30, 4) + }; + let b = DocIndex { + document_id: DocumentId(1), + attribute: Attribute::new_faillible(4, 21), + word_area: WordArea::new_faillible(35, 6) + }; + let c = DocIndex { + document_id: DocumentId(2), + attribute: Attribute::new_faillible(8, 2), + word_area: WordArea::new_faillible(89, 6) + }; let mut builder = PositiveBlobBuilder::memory(); @@ -232,9 +244,21 @@ mod tests { #[test] fn serde_serialize_deserialize() -> Result<(), Box> { - let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; - let b = DocIndex { document_id: DocumentId(1), 
attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; - let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; + let a = DocIndex { + document_id: DocumentId(0), + attribute: Attribute::new_faillible(3, 11), + word_area: WordArea::new_faillible(30, 4) + }; + let b = DocIndex { + document_id: DocumentId(1), + attribute: Attribute::new_faillible(4, 21), + word_area: WordArea::new_faillible(35, 6) + }; + let c = DocIndex { + document_id: DocumentId(2), + attribute: Attribute::new_faillible(8, 2), + word_area: WordArea::new_faillible(89, 6) + }; let mut builder = PositiveBlobBuilder::memory(); diff --git a/src/database/update/positive/update.rs b/src/database/update/positive/update.rs index de064e5a1..244ef9e9a 100644 --- a/src/database/update/positive/update.rs +++ b/src/database/update/positive/update.rs @@ -348,8 +348,8 @@ where B: TokenizerBuilder for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { let doc_index = DocIndex { document_id: self.document_id, - attribute: Attribute::new(self.attribute.0, word_index as u32), - word_area: WordArea::new(char_index as u32, word.len() as u16), + attribute: Attribute::new_faillible(self.attribute.0, word_index as u32), + word_area: WordArea::new_faillible(char_index as u32, word.len() as u16), }; // insert the exact representation diff --git a/src/lib.rs b/src/lib.rs index 2bb82a4b3..b43d8d506 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,21 @@ pub struct DocumentId(u64); pub struct Attribute(u32); impl Attribute { + /// Construct an `Attribute` from an attribute number and + /// the word position of a match according to the tokenizer used. 
+ fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> { + if attribute & 0b1111_1100_0000_0000 != 0 { + return Err(AttributeError::AttributeTooBig) + } + + if index & 0b1111_1111_1100_0000_0000_0000_0000_0000 != 0 { + return Err(AttributeError::IndexTooBig) + } + + let attribute = (attribute as u32) << 22; + Ok(Attribute(attribute | index)) + } + /// Construct an `Attribute` from an attribute number and /// the word position of a match according to the tokenizer used. /// @@ -36,12 +51,16 @@ impl Attribute { /// /// The attribute must not be greater than 1024 /// and the word index not greater than 2^22. - fn new(attribute: u16, index: u32) -> Attribute { - assert!(attribute & 0b1111_1100_0000_0000 == 0); - assert!(index & 0b1111_1111_1100_0000_0000_0000_0000 == 0); - - let attribute = (attribute as u32) << 22; - Attribute(attribute | index) + fn new_faillible(attribute: u16, index: u32) -> Attribute { + match Attribute::new(attribute, index) { + Ok(attribute) => attribute, + Err(AttributeError::AttributeTooBig) => { + panic!("attribute must not be greater than 1024") + }, + Err(AttributeError::IndexTooBig) => { + panic!("attribute word index must not be greater than 2^22") + }, + } } pub fn attribute(&self) -> u16 { @@ -62,6 +81,11 @@ impl fmt::Debug for Attribute { } } +enum AttributeError { + AttributeTooBig, + IndexTooBig, +} + /// Represent a word position in bytes along with the length of it. /// /// It can represent words byte index to maximum 2^22 and @@ -77,12 +101,32 @@ impl WordArea { /// /// The byte index must not be greater than 2^22 /// and the length not greater than 1024.
- fn new(byte_index: u32, length: u16) -> WordArea { - assert!(byte_index & 0b1111_1111_1100_0000_0000_0000_0000 == 0); - assert!(length & 0b1111_1100_0000_0000 == 0); + fn new(byte_index: u32, length: u16) -> Result<WordArea, WordAreaError> { + if byte_index & 0b1111_1111_1100_0000_0000_0000_0000_0000 != 0 { + return Err(WordAreaError::ByteIndexTooBig) + } + + if length & 0b1111_1100_0000_0000 != 0 { + return Err(WordAreaError::LengthTooBig) + } let byte_index = byte_index << 10; - WordArea(byte_index | (length as u32)) + Ok(WordArea(byte_index | (length as u32))) + } + + /// Construct a `WordArea` from a byte index and a length, + /// panicking instead of returning a `WordAreaError` when + /// one of the two values is too big. + fn new_faillible(byte_index: u32, length: u16) -> WordArea { + match WordArea::new(byte_index, length) { + Ok(word_area) => word_area, + Err(WordAreaError::ByteIndexTooBig) => { + panic!("word area byte index must not be greater than 2^22") + }, + Err(WordAreaError::LengthTooBig) => { + panic!("word area length must not be greater than 1024") + }, + } } pub fn byte_index(&self) -> u32 { @@ -103,6 +147,11 @@ impl fmt::Debug for WordArea { } } +enum WordAreaError { + ByteIndexTooBig, + LengthTooBig, +} + /// This structure represent the position of a word /// in a document and its attributes.
/// @@ -166,9 +215,9 @@ impl Match { Match { query_index: 0, distance: 0, - attribute: Attribute::new(0, 0), + attribute: Attribute::new_faillible(0, 0), is_exact: false, - word_area: WordArea::new(0, 0), + word_area: WordArea::new_faillible(0, 0), } } @@ -200,7 +249,7 @@ mod tests { return TestResult::discard() } - let attribute = Attribute::new(gen_attr, gen_index); + let attribute = Attribute::new_faillible(gen_attr, gen_index); let valid_attribute = attribute.attribute() == gen_attr; let valid_index = attribute.word_index() == gen_index; @@ -213,8 +262,8 @@ mod tests { return TestResult::discard() } - let a = Attribute::new(gen_attr, gen_index); - let b = Attribute::new(gen_attr + 1, gen_index + 1); + let a = Attribute::new_faillible(gen_attr, gen_index); + let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1); TestResult::from_bool(a < b) } @@ -224,7 +273,7 @@ mod tests { return TestResult::discard() } - let word_area = WordArea::new(gen_byte_index, gen_length); + let word_area = WordArea::new_faillible(gen_byte_index, gen_length); let valid_char_index = word_area.byte_index() == gen_byte_index; let valid_length = word_area.length() == gen_length; @@ -237,8 +286,8 @@ mod tests { return TestResult::discard() } - let a = WordArea::new(gen_byte_index, gen_length); - let b = WordArea::new(gen_byte_index + 1, gen_length + 1); + let a = WordArea::new_faillible(gen_byte_index, gen_length); + let b = WordArea::new_faillible(gen_byte_index + 1, gen_length + 1); TestResult::from_bool(a < b) } diff --git a/src/rank/criterion/sum_of_typos.rs b/src/rank/criterion/sum_of_typos.rs index 3015a6b4b..409c37cb4 100644 --- a/src/rank/criterion/sum_of_typos.rs +++ b/src/rank/criterion/sum_of_typos.rs @@ -54,8 +54,20 @@ mod tests { fn one_typo_reference() { let doc0 = { let matches = vec![ - Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, - Match { query_index: 1, distance: 0, attribute: 
Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) }, + Match { + query_index: 0, + distance: 0, + attribute: Attribute::new_faillible(0, 0), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, + Match { + query_index: 1, + distance: 0, + attribute: Attribute::new_faillible(0, 2), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, ]; Document { id: DocumentId(0), @@ -65,8 +77,20 @@ mod tests { let doc1 = { let matches = vec![ - Match { query_index: 0, distance: 1, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, - Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) }, + Match { + query_index: 0, + distance: 1, + attribute: Attribute::new_faillible(0, 0), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, + Match { + query_index: 1, + distance: 0, + attribute: Attribute::new_faillible(0, 2), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, ]; Document { id: DocumentId(1), @@ -87,8 +111,20 @@ mod tests { fn no_typo() { let doc0 = { let matches = vec![ - Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, - Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) }, + Match { + query_index: 0, + distance: 0, + attribute: Attribute::new_faillible(0, 0), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, + Match { + query_index: 1, + distance: 0, + attribute: Attribute::new_faillible(0, 1), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, ]; Document { id: DocumentId(0), @@ -98,7 +134,13 @@ mod tests { let doc1 = { let matches = vec![ - Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, + Match { + query_index: 0, + distance: 0, + attribute: 
Attribute::new_faillible(0, 0), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, ]; Document { id: DocumentId(1), @@ -119,8 +161,20 @@ mod tests { fn one_typo() { let doc0 = { let matches = vec![ - Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, - Match { query_index: 1, distance: 1, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) }, + Match { + query_index: 0, + distance: 0, + attribute: Attribute::new_faillible(0, 0), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, + Match { + query_index: 1, + distance: 1, + attribute: Attribute::new_faillible(0, 1), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, ]; Document { id: DocumentId(0), @@ -130,7 +184,13 @@ mod tests { let doc1 = { let matches = vec![ - Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, + Match { + query_index: 0, + distance: 0, + attribute: Attribute::new_faillible(0, 0), + is_exact: false, + word_area: WordArea::new_faillible(0, 6) + }, ]; Document { id: DocumentId(1), diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index 5d7e96122..f4b3aa0cd 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -81,11 +81,11 @@ mod tests { // { id: 3, attr: 3, attr_index: 1 } let matches = &[ - Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new(1, 0), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new(2, 0), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new(3, 1), ..Match::zero() }, + Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, + Match { query_index: 1, attribute: 
Attribute::new_faillible(1, 0), ..Match::zero() }, + Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, + Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() }, + Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() }, ]; // soup -> of = 8 @@ -107,12 +107,12 @@ mod tests { // { id: 3, attr: 1, attr_index: 3 } let matches = &[ - Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() }, - Match { query_index: 0, attribute: Attribute::new(1, 0), ..Match::zero() }, - Match { query_index: 1, attribute: Attribute::new(1, 1), ..Match::zero() }, - Match { query_index: 2, attribute: Attribute::new(1, 2), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new(0, 1), ..Match::zero() }, - Match { query_index: 3, attribute: Attribute::new(1, 3), ..Match::zero() }, + Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() }, + Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() }, + Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() }, + Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() }, + Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() }, + Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() }, ]; // soup -> of = 1