From a3a28c56fad032cf832bc610b8051d0ffd7c76a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sat, 2 Feb 2019 14:17:50 +0100
Subject: [PATCH] feat: Replace compressed Match fields by uncompressed ones

---
 examples/query-database.rs               | 10 +++----
 src/data/doc_indexes.rs                  | 36 ++++++++++++++++--------
 src/database/serde/indexer_serializer.rs | 22 ++++++---------
 src/lib.rs                               | 26 +++++++++++------
 src/rank/query_builder.rs                |  4 ++-
 5 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/examples/query-database.rs b/examples/query-database.rs
index 0a8771a51..d1e6a0e17 100644
--- a/examples/query-database.rs
+++ b/examples/query-database.rs
@@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
     let mut byte_indexes = BTreeMap::new();
 
     for match_ in matches {
-        let match_attribute = match_.attribute.attribute();
+        let match_attribute = match_.attribute;
         if SchemaAttr::new(match_attribute) == attribute {
-            let word_area = match_.word_area;
-
-            let char_index = word_area.char_index() as usize;
-            let char_length = word_area.length() as usize;
+            let char_index = match_.char_index as usize;
+            let char_length = match_.char_length as usize;
             let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
 
             match byte_indexes.entry(byte_index) {
@@ -151,7 +149,7 @@ fn main() -> Result<(), Box<Error>> {
 
             let mut matching_attributes = HashSet::new();
             for _match in doc.matches {
-                let attr = SchemaAttr::new(_match.attribute.attribute());
+                let attr = SchemaAttr::new(_match.attribute);
                 let name = schema.attribute_name(attr);
                 matching_attributes.insert(name);
             }
diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs
index b760765bf..4919b9fa0 100644
--- a/src/data/doc_indexes.rs
+++ b/src/data/doc_indexes.rs
@@ -158,18 +158,24 @@ mod tests {
     fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };
 
         let mut builder = DocIndexesBuilder::memory();
@@ -193,18 +199,24 @@ mod tests {
     fn serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };
 
         let mut builder = DocIndexesBuilder::memory();
diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs
index bdbfb281d..6271e1b7b 100644
--- a/src/database/serde/indexer_serializer.rs
+++ b/src/database/serde/indexer_serializer.rs
@@ -54,10 +54,8 @@ where B: TokenizerBuilder
             let document_id = self.document_id;
 
             // FIXME must u32::try_from instead
-            let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
-                Ok(attribute) => attribute,
-                Err(_) => return Ok(()),
-            };
+            let attribute = self.attribute.0;
+            let word_index = word_index as u32;
 
             // insert the exact representation
             let word_lower = word.to_lowercase();
@@ -68,21 +66,17 @@ where B: TokenizerBuilder
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {
-                let word_area = match WordArea::new(char_index as u32, length) {
-                    Ok(word_area) => word_area,
-                    Err(_) => return Ok(()),
-                };
+                let char_index = char_index as u32;
+                let char_length = length;
 
-                let doc_index = DocIndex { document_id, attribute, word_area };
+                let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
                 self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
             }
 
-            let word_area = match WordArea::new(char_index as u32, length) {
-                Ok(word_area) => word_area,
-                Err(_) => return Ok(()),
-            };
+            let char_index = char_index as u32;
+            let char_length = length;
 
-            let doc_index = DocIndex { document_id, attribute, word_area };
+            let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
             self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
         }
         Ok(())
diff --git a/src/lib.rs b/src/lib.rs
index 19e451f63..5f824b39a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -36,14 +36,16 @@ pub struct DocIndex {
 
     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,
 
     /// The position in bytes where the word was found
     /// along with the length of it.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }
 
 /// This structure represent a matching word with informations
@@ -68,7 +70,8 @@ pub struct Match {
 
     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,
 
     /// Whether the word that match is an exact match or a prefix.
     pub is_exact: bool,
@@ -78,7 +81,8 @@
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }
 
 impl Match {
@@ -86,9 +90,11 @@ impl Match {
         Match {
             query_index: 0,
             distance: 0,
-            attribute: Attribute::new_faillible(0, 0),
+            attribute: 0,
+            word_index: 0,
             is_exact: false,
-            word_area: WordArea::new_faillible(0, 0),
+            char_index: 0,
+            char_length: 0,
         }
     }
 
@@ -96,9 +102,11 @@ impl Match {
         Match {
             query_index: u32::max_value(),
             distance: u8::max_value(),
-            attribute: Attribute::max_value(),
+            attribute: u16::max_value(),
+            word_index: u32::max_value(),
             is_exact: true,
-            word_area: WordArea::max_value(),
+            char_index: u32::max_value(),
+            char_length: u16::max_value(),
         }
     }
 }
@@ -110,6 +118,6 @@ mod tests {
 
     #[test]
     fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 16);
+        assert_eq!(mem::size_of::<DocIndex>(), 24);
     }
 }
diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs
index 8146fc7fa..91d645160 100644
--- a/src/rank/query_builder.rs
+++ b/src/rank/query_builder.rs
@@ -111,8 +111,10 @@ where D: Deref,
                     query_index: iv.index as u32,
                     distance: distance,
                     attribute: doc_index.attribute,
+                    word_index: doc_index.word_index,
                     is_exact: is_exact,
-                    word_area: doc_index.word_area,
+                    char_index: doc_index.char_index,
+                    char_length: doc_index.char_length,
                 };
                 matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
             }
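
Below the patch, a minimal standalone sketch of the new uncompressed DocIndex layout and the arithmetic behind the updated size assertion. The field names and integer types come straight from the hunks above; the DocumentId(u64) stand-in is an assumption inferred from the old 16-byte size (8 + 4 + 4), not something the patch states.

    use std::mem;

    // Stand-in for the crate's DocumentId newtype; the u64 payload is an
    // assumption inferred from the previous 16-byte DocIndex.
    struct DocumentId(u64);

    // The uncompressed layout introduced by this patch: the packed
    // Attribute and WordArea wrappers are replaced by plain integer fields.
    struct DocIndex {
        document_id: DocumentId,
        attribute: u16,
        word_index: u32,
        char_index: u32,
        char_length: u16,
    }

    fn main() {
        // 8 + 2 + 4 + 4 + 2 = 20 bytes of fields, rounded up to 24 to keep
        // the 8-byte alignment required by the u64 inside DocumentId, which
        // is why the docindex_mem_size assertion moves from 16 to 24.
        assert_eq!(mem::size_of::<DocIndex>(), 24);
        println!("DocIndex: {} bytes", mem::size_of::<DocIndex>());
    }

With the default Rust representation the compiler may reorder the fields, but the data still occupies 20 bytes and is padded to a multiple of 8, so the 24-byte result holds either way.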