diff --git a/examples/query-database.rs b/examples/query-database.rs index e61e2d0ab..0a8771a51 100644 --- a/examples/query-database.rs +++ b/examples/query-database.rs @@ -48,6 +48,24 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { Ok(()) } +fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) { + let mut byte_index = 0; + let mut byte_length = 0; + + for (n, (i, c)) in text.char_indices().enumerate() { + if n == index { + byte_index = i; + } + + if n + 1 == index + length { + byte_length = i - byte_index + c.len_utf8(); + break; + } + } + + (byte_index, byte_length) +} + fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec { let mut byte_indexes = BTreeMap::new(); @@ -55,11 +73,18 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) let match_attribute = match_.attribute.attribute(); if SchemaAttr::new(match_attribute) == attribute { let word_area = match_.word_area; - let byte_index = word_area.byte_index() as usize; - let length = word_area.length() as usize; + + let char_index = word_area.char_index() as usize; + let char_length = word_area.length() as usize; + let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); + match byte_indexes.entry(byte_index) { - Entry::Vacant(entry) => { entry.insert(length); }, - Entry::Occupied(mut entry) => if *entry.get() < length { entry.insert(length); }, + Entry::Vacant(entry) => { entry.insert(byte_length); }, + Entry::Occupied(mut entry) => { + if *entry.get() < byte_length { + entry.insert(byte_length); + } + }, } } } diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs index 7bbcca7e7..ae3eba436 100644 --- a/src/database/serde/indexer_serializer.rs +++ b/src/database/serde/indexer_serializer.rs @@ -51,24 +51,14 @@ where B: TokenizerBuilder fn serialize_str(self, v: &str) -> Result { for Token { word, word_index, char_index } in 
self.tokenizer_builder.build(v) { + let document_id = self.document_id; + // FIXME must u32::try_from instead let attribute = match Attribute::new(self.attribute.0, word_index as u32) { Ok(attribute) => attribute, Err(_) => return Ok(()), }; - // FIXME must u16/u32::try_from instead - let word_area = match WordArea::new(char_index as u32, word.len() as u16) { - Ok(word_area) => word_area, - Err(_) => return Ok(()), - }; - - let doc_index = DocIndex { - document_id: self.document_id, - attribute, - word_area - }; - // insert the exact representation let word_lower = word.to_lowercase(); @@ -77,9 +67,26 @@ where B: TokenizerBuilder // and the unidecoded lowercased version let word_unidecoded = unidecode::unidecode(word).to_lowercase(); if word_lower != word_unidecoded { + + // FIXME must u16/u32::try_from instead + let length = word_unidecoded.chars().count() as u16; + let word_area = match WordArea::new(char_index as u32, length) { + Ok(word_area) => word_area, + Err(_) => return Ok(()), + }; + + let doc_index = DocIndex { document_id, attribute, word_area }; self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); } + // FIXME must u16/u32::try_from instead + let length = word.chars().count() as u16; + let word_area = match WordArea::new(char_index as u32, length) { + Ok(word_area) => word_area, + Err(_) => return Ok(()), + }; + + let doc_index = DocIndex { document_id, attribute, word_area }; self.update.insert_doc_index(word_lower.into_bytes(), doc_index); } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 2a241a2ac..03c2a200d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -97,15 +97,15 @@ enum AttributeError { pub struct WordArea(u32); impl WordArea { - /// Construct a `WordArea` from a word position in bytes - /// and the length of it. + /// Construct a `WordArea` from a word position expressed as + /// a number of characters and the length of it. 
/// /// # Panics /// - /// The byte index must not be greater than 2^22 + /// The char index must not be greater than 2^22 /// and the length not greater than 1024. - fn new(byte_index: u32, length: u16) -> Result { - if byte_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { + fn new(char_index: u32, length: u16) -> Result { + if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { return Err(WordAreaError::ByteIndexTooBig) } @@ -113,12 +113,12 @@ impl WordArea { return Err(WordAreaError::LengthTooBig) } - let byte_index = byte_index << 10; - Ok(WordArea(byte_index | u32::from(length))) + let char_index = char_index << 10; + Ok(WordArea(char_index | u32::from(length))) } - fn new_faillible(byte_index: u32, length: u16) -> WordArea { - match WordArea::new(byte_index, length) { + fn new_faillible(char_index: u32, length: u16) -> WordArea { + match WordArea::new(char_index, length) { Ok(word_area) => word_area, Err(WordAreaError::ByteIndexTooBig) => { panic!("word area byte index must not be greater than 2^22") @@ -130,7 +130,7 @@ impl WordArea { } #[inline] - pub fn byte_index(self) -> u32 { + pub fn char_index(self) -> u32 { self.0 >> 10 } @@ -143,7 +143,7 @@ impl WordArea { impl fmt::Debug for WordArea { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("WordArea") - .field("byte_index", &self.byte_index()) + .field("char_index", &self.char_index()) .field("length", &self.length()) .finish() } @@ -270,26 +270,26 @@ mod tests { TestResult::from_bool(a < b) } - fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult { - if gen_byte_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { + fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult { + if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { return TestResult::discard() } - let word_area = WordArea::new_faillible(gen_byte_index, gen_length); + let word_area = WordArea::new_faillible(gen_char_index, gen_length); - let valid_char_index = 
word_area.byte_index() == gen_byte_index; + let valid_char_index = word_area.char_index() == gen_char_index; let valid_length = word_area.length() == gen_length; TestResult::from_bool(valid_char_index && valid_length) } - fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult { - if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { + fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult { + if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { return TestResult::discard() } - let a = WordArea::new_faillible(gen_byte_index, gen_length); - let b = WordArea::new_faillible(gen_byte_index + 1, gen_length + 1); + let a = WordArea::new_faillible(gen_char_index, gen_length); + let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1); TestResult::from_bool(a < b) } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 79794f6d8..a2910728d 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -96,7 +96,7 @@ impl<'a> Iterator for Tokenizer<'a> { let (spaces, word) = prefix.split_at(start_word); self.inner = tail; - self.char_index += spaces.len(); + self.char_index += spaces.chars().count(); self.word_index += distance.map(Separator::to_usize).unwrap_or(0); let token = Token { @@ -105,7 +105,7 @@ impl<'a> Iterator for Tokenizer<'a> { char_index: self.char_index, }; - self.char_index += word.len(); + self.char_index += word.chars().count(); return Some(token) } @@ -122,7 +122,7 @@ impl<'a> Iterator for Tokenizer<'a> { let token = Token { word: word, word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0), - char_index: self.char_index + spaces.len(), + char_index: self.char_index + spaces.chars().count(), }; return Some(token) } @@ -173,7 +173,7 @@ mod tests { assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); - assert_eq!(tokenizer.next(), 
Some(Token { word: "aïe", word_index: 9, char_index: 13 })); + assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 })); assert_eq!(tokenizer.next(), None); let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,"); @@ -181,8 +181,8 @@ mod tests { assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 19 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 25 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 })); + assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 })); assert_eq!(tokenizer.next(), None); } }