Merge pull request #51 from Kerollmops/wordarea-attribute-fallible

Make the Attribute and WordArea errors recoverable
This commit is contained in:
Clément Renault 2018-12-28 18:26:19 +01:00 committed by GitHub
commit 70772eca5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 208 additions and 51 deletions

View File

@ -164,9 +164,21 @@ mod tests {
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
@ -187,9 +199,21 @@ mod tests {
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();

View File

@ -209,9 +209,21 @@ mod tests {
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = PositiveBlobBuilder::memory();
@ -232,9 +244,21 @@ mod tests {
#[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) };
let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) };
let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = PositiveBlobBuilder::memory();

View File

@ -348,8 +348,8 @@ where B: TokenizerBuilder
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
let doc_index = DocIndex {
document_id: self.document_id,
attribute: Attribute::new(self.attribute.0, word_index as u32),
word_area: WordArea::new(char_index as u32, word.len() as u16),
attribute: Attribute::new_faillible(self.attribute.0, word_index as u32),
word_area: WordArea::new_faillible(char_index as u32, word.len() as u16),
};
// insert the exact representation

View File

@ -29,6 +29,21 @@ pub struct DocumentId(u64);
pub struct Attribute(u32);
impl Attribute {
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
///
/// The attribute number is stored in the 10 most significant bits of the
/// inner `u32` and the word index in the 22 least significant bits.
///
/// # Errors
///
/// Returns `AttributeError::AttributeTooBig` when `attribute` does not fit
/// in 10 bits (1024 or more) and `AttributeError::IndexTooBig` when `index`
/// does not fit in 22 bits (2^22 or more).
fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
    // Bits above the 10 lowest ones would be shifted out of the `u32` below.
    if attribute & 0b1111_1100_0000_0000 != 0 {
        return Err(AttributeError::AttributeTooBig)
    }

    // The mask must cover the 10 upper bits of the full 32-bit index.
    // A shorter literal rejects valid indexes and lets the topmost bits
    // leak into the attribute part of the packed value.
    if index & 0b1111_1111_1100_0000_0000_0000_0000_0000 != 0 {
        return Err(AttributeError::IndexTooBig)
    }

    let attribute = u32::from(attribute) << 22;
    Ok(Attribute(attribute | index))
}
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
///
@ -36,12 +51,16 @@ impl Attribute {
///
/// The attribute must be lower than 1024 (2^10)
/// and the word index lower than 2^22.
fn new(attribute: u16, index: u32) -> Attribute {
assert!(attribute & 0b1111_1100_0000_0000 == 0);
assert!(index & 0b1111_1111_1100_0000_0000_0000_0000 == 0);
let attribute = (attribute as u32) << 22;
Attribute(attribute | index)
fn new_faillible(attribute: u16, index: u32) -> Attribute {
match Attribute::new(attribute, index) {
Ok(attribute) => attribute,
Err(AttributeError::AttributeTooBig) => {
panic!("attribute must not be greater than 1024")
},
Err(AttributeError::IndexTooBig) => {
panic!("attribute word index must not be greater than 2^22")
},
}
}
pub fn attribute(&self) -> u16 {
@ -62,6 +81,11 @@ impl fmt::Debug for Attribute {
}
}
/// Reasons why `Attribute::new` refuses to build an `Attribute`.
enum AttributeError {
/// The attribute number has bits set above its 10 lowest ones.
AttributeTooBig,
/// The word index was rejected by the range check on the index.
IndexTooBig,
}
/// Represents a word position in bytes along with its length.
///
/// It can represent words byte index to maximum 2^22 and
@ -77,12 +101,32 @@ impl WordArea {
///
/// The byte index must be lower than 2^22
/// and the length lower than 1024 (2^10).
fn new(byte_index: u32, length: u16) -> WordArea {
fn new(byte_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
assert!(byte_index & 0b1111_1111_1100_0000_0000_0000_0000 == 0);
assert!(length & 0b1111_1100_0000_0000 == 0);
if byte_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
return Err(WordAreaError::ByteIndexTooBig)
}
if length & 0b1111_1100_0000_0000 != 0 {
return Err(WordAreaError::LengthTooBig)
}
let byte_index = byte_index << 10;
WordArea(byte_index | (length as u32))
Ok(WordArea(byte_index | (length as u32)))
}
fn new_faillible(byte_index: u32, length: u16) -> WordArea {
match WordArea::new(byte_index, length) {
Ok(word_area) => word_area,
Err(WordAreaError::ByteIndexTooBig) => {
panic!("word area byte index must not be greater than 2^22")
},
Err(WordAreaError::LengthTooBig) => {
panic!("word area length must not be greater than 1024")
},
}
}
pub fn byte_index(&self) -> u32 {
@ -103,6 +147,11 @@ impl fmt::Debug for WordArea {
}
}
/// Reasons why `WordArea::new` refuses to build a `WordArea`.
enum WordAreaError {
/// The byte index was rejected by the range check on the byte index.
ByteIndexTooBig,
/// The length has bits set above its 10 lowest ones.
LengthTooBig,
}
/// This structure represents the position of a word
/// in a document and its attributes.
///
@ -166,9 +215,9 @@ impl Match {
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new(0, 0),
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new(0, 0),
word_area: WordArea::new_faillible(0, 0),
}
}
@ -200,7 +249,7 @@ mod tests {
return TestResult::discard()
}
let attribute = Attribute::new(gen_attr, gen_index);
let attribute = Attribute::new_faillible(gen_attr, gen_index);
let valid_attribute = attribute.attribute() == gen_attr;
let valid_index = attribute.word_index() == gen_index;
@ -213,8 +262,8 @@ mod tests {
return TestResult::discard()
}
let a = Attribute::new(gen_attr, gen_index);
let b = Attribute::new(gen_attr + 1, gen_index + 1);
let a = Attribute::new_faillible(gen_attr, gen_index);
let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);
TestResult::from_bool(a < b)
}
@ -224,7 +273,7 @@ mod tests {
return TestResult::discard()
}
let word_area = WordArea::new(gen_byte_index, gen_length);
let word_area = WordArea::new_faillible(gen_byte_index, gen_length);
let valid_char_index = word_area.byte_index() == gen_byte_index;
let valid_length = word_area.length() == gen_length;
@ -237,8 +286,8 @@ mod tests {
return TestResult::discard()
}
let a = WordArea::new(gen_byte_index, gen_length);
let b = WordArea::new(gen_byte_index + 1, gen_length + 1);
let a = WordArea::new_faillible(gen_byte_index, gen_length);
let b = WordArea::new_faillible(gen_byte_index + 1, gen_length + 1);
TestResult::from_bool(a < b)
}

View File

@ -54,8 +54,20 @@ mod tests {
fn one_typo_reference() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
@ -65,8 +77,20 @@ mod tests {
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 1, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) },
Match {
query_index: 0,
distance: 1,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
@ -87,8 +111,20 @@ mod tests {
fn no_typo() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
@ -98,7 +134,13 @@ mod tests {
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
@ -119,8 +161,20 @@ mod tests {
fn one_typo() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match { query_index: 1, distance: 1, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 1,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
@ -130,7 +184,13 @@ mod tests {
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),

View File

@ -81,11 +81,11 @@ mod tests {
// { id: 3, attr: 3, attr_index: 1 }
let matches = &[
Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new(1, 0), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new(3, 1), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
];
// soup -> of = 8
@ -107,12 +107,12 @@ mod tests {
// { id: 3, attr: 1, attr_index: 3 }
let matches = &[
Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new(1, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new(1, 3), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
];
// soup -> of = 1