mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-11 15:38:55 +01:00
feat: Make WordArea be based on char index and length
This commit is contained in:
parent
86bfb173ef
commit
b53ef08d05
@ -48,6 +48,24 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts a range expressed in characters into the matching byte range of `text`.
///
/// `index` is the position of the first character of the range and `length`
/// the number of characters it spans. The returned tuple is
/// `(byte_index, byte_length)`, both expressed in bytes of the UTF-8 encoded `text`.
///
/// Edge cases:
///
/// - when `index` does not designate a character of `text` (including an
///   empty `text`), `(0, 0)` is returned;
/// - when `length` is zero, an empty range located at the start character is
///   returned;
/// - when the range runs past the end of `text`, it is clamped to the end of
///   `text` instead of being reported as empty.
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    // Byte offset of every character boundary, followed by the end of the
    // text so that a range finishing on the last character resolves too.
    let mut boundaries = text
        .char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(text.len()))
        .skip(index);

    let byte_index = match boundaries.next() {
        Some(offset) if offset < text.len() => offset,
        // The start is past the last character: there is nothing to cover.
        _ => return (0, 0),
    };

    let byte_end = match length.checked_sub(1) {
        // A zero-length range ends where it starts.
        None => byte_index,
        // The start boundary has been consumed, so the end boundary is
        // `length - 1` items further; a missing boundary means the range
        // overruns the text and must be clamped.
        Some(remaining) => boundaries.nth(remaining).unwrap_or(text.len()),
    };

    (byte_index, byte_end - byte_index)
}
|
||||||
|
|
||||||
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
|
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
|
||||||
let mut byte_indexes = BTreeMap::new();
|
let mut byte_indexes = BTreeMap::new();
|
||||||
|
|
||||||
@ -55,11 +73,18 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
|
|||||||
let match_attribute = match_.attribute.attribute();
|
let match_attribute = match_.attribute.attribute();
|
||||||
if SchemaAttr::new(match_attribute) == attribute {
|
if SchemaAttr::new(match_attribute) == attribute {
|
||||||
let word_area = match_.word_area;
|
let word_area = match_.word_area;
|
||||||
let byte_index = word_area.byte_index() as usize;
|
|
||||||
let length = word_area.length() as usize;
|
let char_index = word_area.char_index() as usize;
|
||||||
|
let char_length = word_area.length() as usize;
|
||||||
|
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
||||||
|
|
||||||
match byte_indexes.entry(byte_index) {
|
match byte_indexes.entry(byte_index) {
|
||||||
Entry::Vacant(entry) => { entry.insert(length); },
|
Entry::Vacant(entry) => { entry.insert(byte_length); },
|
||||||
Entry::Occupied(mut entry) => if *entry.get() < length { entry.insert(length); },
|
Entry::Occupied(mut entry) => {
|
||||||
|
if *entry.get() < byte_length {
|
||||||
|
entry.insert(byte_length);
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,24 +51,14 @@ where B: TokenizerBuilder
|
|||||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||||
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
||||||
|
|
||||||
|
let document_id = self.document_id;
|
||||||
|
|
||||||
// FIXME must u32::try_from instead
|
// FIXME must u32::try_from instead
|
||||||
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
||||||
Ok(attribute) => attribute,
|
Ok(attribute) => attribute,
|
||||||
Err(_) => return Ok(()),
|
Err(_) => return Ok(()),
|
||||||
};
|
};
|
||||||
|
|
||||||
// FIXME must u16/u32::try_from instead
|
|
||||||
let word_area = match WordArea::new(char_index as u32, word.len() as u16) {
|
|
||||||
Ok(word_area) => word_area,
|
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc_index = DocIndex {
|
|
||||||
document_id: self.document_id,
|
|
||||||
attribute,
|
|
||||||
word_area
|
|
||||||
};
|
|
||||||
|
|
||||||
// insert the exact representation
|
// insert the exact representation
|
||||||
let word_lower = word.to_lowercase();
|
let word_lower = word.to_lowercase();
|
||||||
|
|
||||||
@ -77,9 +67,26 @@ where B: TokenizerBuilder
|
|||||||
// and the unidecoded lowercased version
|
// and the unidecoded lowercased version
|
||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
if word_lower != word_unidecoded {
|
if word_lower != word_unidecoded {
|
||||||
|
|
||||||
|
// FIXME must u16/u32::try_from instead
|
||||||
|
let length = word_unidecoded.chars().count() as u16;
|
||||||
|
let word_area = match WordArea::new(char_index as u32, length) {
|
||||||
|
Ok(word_area) => word_area,
|
||||||
|
Err(_) => return Ok(()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let doc_index = DocIndex { document_id, attribute, word_area };
|
||||||
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME must u16/u32::try_from instead
|
||||||
|
let length = word.chars().count() as u16;
|
||||||
|
let word_area = match WordArea::new(char_index as u32, length) {
|
||||||
|
Ok(word_area) => word_area,
|
||||||
|
Err(_) => return Ok(()),
|
||||||
|
};
|
||||||
|
|
||||||
|
let doc_index = DocIndex { document_id, attribute, word_area };
|
||||||
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
|
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
38
src/lib.rs
38
src/lib.rs
@ -97,15 +97,15 @@ enum AttributeError {
|
|||||||
pub struct WordArea(u32);
|
pub struct WordArea(u32);
|
||||||
|
|
||||||
impl WordArea {
|
impl WordArea {
|
||||||
/// Construct a `WordArea` from a word position in bytes
|
/// Construct a `WordArea` from a word position expressed as
|
||||||
/// and the length of it.
|
/// a number of characters and the length of it.
|
||||||
///
|
///
|
||||||
/// # Panics
|
/// # Panics
|
||||||
///
|
///
|
||||||
/// The byte index must not be greater than 2^22
|
/// The char index must not be greater than 2^22
|
||||||
/// and the length not greater than 1024.
|
/// and the length not greater than 1024.
|
||||||
fn new(byte_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
|
fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
|
||||||
if byte_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
|
if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
|
||||||
return Err(WordAreaError::ByteIndexTooBig)
|
return Err(WordAreaError::ByteIndexTooBig)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -113,12 +113,12 @@ impl WordArea {
|
|||||||
return Err(WordAreaError::LengthTooBig)
|
return Err(WordAreaError::LengthTooBig)
|
||||||
}
|
}
|
||||||
|
|
||||||
let byte_index = byte_index << 10;
|
let char_index = char_index << 10;
|
||||||
Ok(WordArea(byte_index | u32::from(length)))
|
Ok(WordArea(char_index | u32::from(length)))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new_faillible(byte_index: u32, length: u16) -> WordArea {
|
fn new_faillible(char_index: u32, length: u16) -> WordArea {
|
||||||
match WordArea::new(byte_index, length) {
|
match WordArea::new(char_index, length) {
|
||||||
Ok(word_area) => word_area,
|
Ok(word_area) => word_area,
|
||||||
Err(WordAreaError::ByteIndexTooBig) => {
|
Err(WordAreaError::ByteIndexTooBig) => {
|
||||||
panic!("word area byte index must not be greater than 2^22")
|
panic!("word area byte index must not be greater than 2^22")
|
||||||
@ -130,7 +130,7 @@ impl WordArea {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn byte_index(self) -> u32 {
|
pub fn char_index(self) -> u32 {
|
||||||
self.0 >> 10
|
self.0 >> 10
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,7 +143,7 @@ impl WordArea {
|
|||||||
impl fmt::Debug for WordArea {
|
impl fmt::Debug for WordArea {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
f.debug_struct("WordArea")
|
f.debug_struct("WordArea")
|
||||||
.field("byte_index", &self.byte_index())
|
.field("char_index", &self.char_index())
|
||||||
.field("length", &self.length())
|
.field("length", &self.length())
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
@ -270,26 +270,26 @@ mod tests {
|
|||||||
TestResult::from_bool(a < b)
|
TestResult::from_bool(a < b)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult {
|
fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
|
||||||
if gen_byte_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
|
if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
|
||||||
return TestResult::discard()
|
return TestResult::discard()
|
||||||
}
|
}
|
||||||
|
|
||||||
let word_area = WordArea::new_faillible(gen_byte_index, gen_length);
|
let word_area = WordArea::new_faillible(gen_char_index, gen_length);
|
||||||
|
|
||||||
let valid_char_index = word_area.byte_index() == gen_byte_index;
|
let valid_char_index = word_area.char_index() == gen_char_index;
|
||||||
let valid_length = word_area.length() == gen_length;
|
let valid_length = word_area.length() == gen_length;
|
||||||
|
|
||||||
TestResult::from_bool(valid_char_index && valid_length)
|
TestResult::from_bool(valid_char_index && valid_length)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult {
|
fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
|
||||||
if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
|
if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
|
||||||
return TestResult::discard()
|
return TestResult::discard()
|
||||||
}
|
}
|
||||||
|
|
||||||
let a = WordArea::new_faillible(gen_byte_index, gen_length);
|
let a = WordArea::new_faillible(gen_char_index, gen_length);
|
||||||
let b = WordArea::new_faillible(gen_byte_index + 1, gen_length + 1);
|
let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);
|
||||||
|
|
||||||
TestResult::from_bool(a < b)
|
TestResult::from_bool(a < b)
|
||||||
}
|
}
|
||||||
|
@ -96,7 +96,7 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
let (spaces, word) = prefix.split_at(start_word);
|
let (spaces, word) = prefix.split_at(start_word);
|
||||||
|
|
||||||
self.inner = tail;
|
self.inner = tail;
|
||||||
self.char_index += spaces.len();
|
self.char_index += spaces.chars().count();
|
||||||
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
|
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
|
||||||
|
|
||||||
let token = Token {
|
let token = Token {
|
||||||
@ -105,7 +105,7 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
char_index: self.char_index,
|
char_index: self.char_index,
|
||||||
};
|
};
|
||||||
|
|
||||||
self.char_index += word.len();
|
self.char_index += word.chars().count();
|
||||||
return Some(token)
|
return Some(token)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,7 +122,7 @@ impl<'a> Iterator for Tokenizer<'a> {
|
|||||||
let token = Token {
|
let token = Token {
|
||||||
word: word,
|
word: word,
|
||||||
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
|
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
|
||||||
char_index: self.char_index + spaces.len(),
|
char_index: self.char_index + spaces.chars().count(),
|
||||||
};
|
};
|
||||||
return Some(token)
|
return Some(token)
|
||||||
}
|
}
|
||||||
@ -173,7 +173,7 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
|
||||||
assert_eq!(tokenizer.next(), None);
|
assert_eq!(tokenizer.next(), None);
|
||||||
|
|
||||||
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
|
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
|
||||||
@ -181,8 +181,8 @@ mod tests {
|
|||||||
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 19 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
|
||||||
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 25 }));
|
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
|
||||||
assert_eq!(tokenizer.next(), None);
|
assert_eq!(tokenizer.next(), None);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user