Merge pull request #74 from Kerollmops/same-document-update-shadowed

Make multiple updates to the same document shadow one another (a later update replaces the earlier one)
This commit is contained in:
Clément Renault 2019-01-10 15:57:49 +01:00 committed by GitHub
commit 3d820a27ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 15 additions and 9 deletions

View File

@ -49,8 +49,8 @@ where B: TokenizerBuilder
} }
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> { fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id; let document_id = self.document_id;
// FIXME must u32::try_from instead // FIXME must u32::try_from instead
@ -61,15 +61,13 @@ where B: TokenizerBuilder
// insert the exact representation // insert the exact representation
let word_lower = word.to_lowercase(); let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue } if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version // and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase(); let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded { if word_lower != word_unidecoded {
// FIXME must u16/u32::try_from instead
let length = word_unidecoded.chars().count() as u16;
let word_area = match WordArea::new(char_index as u32, length) { let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area, Ok(word_area) => word_area,
Err(_) => return Ok(()), Err(_) => return Ok(()),
@ -79,8 +77,6 @@ where B: TokenizerBuilder
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index); self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
} }
// FIXME must u16/u32::try_from instead
let length = word.chars().count() as u16;
let word_area = match WordArea::new(char_index as u32, length) { let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area, Ok(word_area) => word_area,
Err(_) => return Ok(()), Err(_) => return Ok(()),

View File

@ -1,4 +1,4 @@
use std::collections::BTreeMap; use std::collections::btree_map::{BTreeMap, Entry};
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
@ -39,6 +39,10 @@ impl DocumentUpdate {
pub fn remove(&mut self) { pub fn remove(&mut self) {
self.cleared = true; self.cleared = true;
self.clear();
}
pub fn clear(&mut self) {
self.words_indexes.clear(); self.words_indexes.clear();
self.attributes.clear(); self.attributes.clear();
} }
@ -61,7 +65,13 @@ impl RawUpdateBuilder {
} }
pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate { pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
self.document_updates.entry(document_id).or_insert_with(DocumentUpdate::new) match self.document_updates.entry(document_id) {
Entry::Occupied(mut occupied) => {
occupied.get_mut().clear();
occupied.into_mut()
},
Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
}
} }
pub fn build(mut self) -> Result<Update, Box<Error>> { pub fn build(mut self) -> Result<Update, Box<Error>> {