feat: Base WordArea on char index and length

This commit is contained in:
Clément Renault 2019-01-09 20:14:08 +01:00
parent 86bfb173ef
commit b53ef08d05
No known key found for this signature in database
GPG key ID: 0151CDAB43460DAE
4 changed files with 73 additions and 41 deletions

View file

@ -51,24 +51,14 @@ where B: TokenizerBuilder
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
let document_id = self.document_id;
// FIXME must u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
Err(_) => return Ok(()),
};
// FIXME must u16/u32::try_from instead
let word_area = match WordArea::new(char_index as u32, word.len() as u16) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex {
document_id: self.document_id,
attribute,
word_area
};
// insert the exact representation
let word_lower = word.to_lowercase();
@ -77,9 +67,26 @@ where B: TokenizerBuilder
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
// FIXME must u16/u32::try_from instead
let length = word_unidecoded.chars().count() as u16;
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
}
// FIXME must u16/u32::try_from instead
let length = word.chars().count() as u16;
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
}
Ok(())