From a45cc4b6189c2eb40758b9749c3d1c8b19b88d40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Mar 2019 14:59:58 +0100 Subject: [PATCH 1/3] fix: Reduce the size of the DocIndex type --- examples/query-database.rs | 2 +- src/database/serde/indexer_serializer.rs | 6 +++--- src/lib.rs | 14 +++++++------- src/rank/criterion/sum_of_words_position.rs | 2 +- src/rank/criterion/words_proximity.rs | 14 +++++++------- src/rank/mod.rs | 8 ++++---- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/query-database.rs b/examples/query-database.rs index 64e035a8a..ca6733c30 100644 --- a/examples/query-database.rs +++ b/examples/query-database.rs @@ -126,7 +126,7 @@ fn crop_text( (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2) }) .map(|match_| { - Match { char_index: match_.char_index - start as u32, ..match_ } + Match { char_index: match_.char_index - start as u16, ..match_ } }) .collect(); diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs index 75860d937..c25ffe98c 100644 --- a/src/database/serde/indexer_serializer.rs +++ b/src/database/serde/indexer_serializer.rs @@ -56,7 +56,7 @@ where B: TokenizerBuilder // FIXME must u32::try_from instead let attribute = self.attribute.0; - let word_index = word_index as u32; + let word_index = word_index as u16; // insert the exact representation let word_lower = word.to_lowercase(); @@ -69,7 +69,7 @@ where B: TokenizerBuilder let word_unidecoded = unidecode::unidecode(word).to_lowercase(); let word_unidecoded = word_unidecoded.trim(); if word_lower != word_unidecoded { - let char_index = char_index as u32; + let char_index = char_index as u16; let char_length = length; let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; @@ -77,7 +77,7 @@ where B: TokenizerBuilder } } - let char_index = char_index as u32; + let char_index = char_index as u16; let char_length = length; let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; diff --git a/src/lib.rs b/src/lib.rs index e77e03ecb..964de8f75 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -50,14 +50,14 @@ pub struct DocIndex { /// The attribute in the document where the word was found /// along with the index in it. pub attribute: u16, - pub word_index: u32, + pub word_index: u16, /// The position in bytes where the word was found /// along with the length of it. /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. - pub char_index: u32, + pub char_index: u16, pub char_length: u16, } @@ -84,7 +84,7 @@ pub struct Match { /// The attribute in the document where the word was found /// along with the index in it. pub attribute: u16, - pub word_index: u32, + pub word_index: u16, /// Whether the word that match is an exact match or a prefix. pub is_exact: bool, @@ -94,7 +94,7 @@ pub struct Match { /// /// It informs on the original word area in the text indexed /// without needing to run the tokenizer again. - pub char_index: u32, + pub char_index: u16, pub char_length: u16, } @@ -116,9 +116,9 @@ impl Match { query_index: u32::max_value(), distance: u8::max_value(), attribute: u16::max_value(), - word_index: u32::max_value(), + word_index: u16::max_value(), is_exact: true, - char_index: u32::max_value(), + char_index: u16::max_value(), char_length: u16::max_value(), } } @@ -131,6 +131,6 @@ mod tests { #[test] fn docindex_mem_size() { - assert_eq!(mem::size_of::(), 24); + assert_eq!(mem::size_of::(), 16); } } diff --git a/src/rank/criterion/sum_of_words_position.rs b/src/rank/criterion/sum_of_words_position.rs index 73ea5978c..5938ce5ab 100644 --- a/src/rank/criterion/sum_of_words_position.rs +++ b/src/rank/criterion/sum_of_words_position.rs @@ -6,7 +6,7 @@ use crate::rank::criterion::Criterion; use crate::rank::RawDocument; #[inline] -fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize { +fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { let mut sum_word_index = 0; let mut index = 0; diff --git a/src/rank/criterion/words_proximity.rs b/src/rank/criterion/words_proximity.rs index 42cc738ce..dbf26e21a 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/src/rank/criterion/words_proximity.rs @@ -5,14 +5,14 @@ use slice_group_by::GroupBy; use crate::rank::criterion::Criterion; use crate::rank::RawDocument; -const MAX_DISTANCE: u32 = 8; +const MAX_DISTANCE: u16 = 8; #[inline] fn clone_tuple((a, b): (&T, &U)) -> (T, U) { (a.clone(), b.clone()) } -fn index_proximity(lhs: u32, rhs: u32) -> u32 { +fn index_proximity(lhs: u16, rhs: u16) -> u16 { if lhs < rhs { cmp::min(rhs - lhs, MAX_DISTANCE) } else { @@ -20,13 +20,13 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 { } } -fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 { +fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { if lattr != rattr { return MAX_DISTANCE } index_proximity(lwi, rwi) } -fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 { - let mut min_prox = u32::max_value(); +fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { + let mut min_prox = u16::max_value(); for a in lattr.iter().zip(lwi) { for b in rattr.iter().zip(rwi) { @@ -43,8 +43,8 @@ fn matches_proximity( query_index: &[u32], distance: &[u8], attribute: &[u16], - word_index: &[u32], -) -> u32 + word_index: &[u16], +) -> u16 { let mut query_index_groups = query_index.linear_group(); let mut proximity = 0; diff --git a/src/rank/mod.rs b/src/rank/mod.rs index 3b31c0794..f5b07d27d 100644 --- a/src/rank/mod.rs +++ b/src/rank/mod.rs @@ -79,7 +79,7 @@ impl RawDocument { unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } } - pub fn word_index(&self) -> &[u32] { + pub fn word_index(&self) -> &[u16] { let r = self.matches.range; // it is safe because construction/modifications // can only be done in this module @@ -93,7 +93,7 @@ impl RawDocument { unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } } - pub fn char_index(&self) -> &[u32] { + pub fn char_index(&self) -> &[u16] { let r = self.matches.range; // it is safe because construction/modifications // can only be done in this module @@ -150,9 +150,9 @@ struct Matches { query_index: Vec, distance: Vec, attribute: Vec, - word_index: Vec, + word_index: Vec, is_exact: Vec, - char_index: Vec, + char_index: Vec, char_length: Vec, } From 383a49b44f943ade59ce92dd2d0578d930a9de9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Mar 2019 15:00:53 +0100 Subject: [PATCH 2/3] fix: Compact the whole database for each WriteBatch injected --- src/database/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/database/mod.rs b/src/database/mod.rs index 2b7a87f45..d8b1bea79 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -266,6 +266,7 @@ impl DatabaseIndex { fn commit_update(&self, update: Update) -> Result>>, Box> { let batch = update.build()?; self.db.write(batch)?; + self.db.compact_range(None, None); let snapshot = Snapshot::new(self.db.clone()); let view = Arc::new(DatabaseView::new(snapshot)?); From aae301878c6bf1dbb04e9535395147d0edd1afad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Mar 2019 15:01:38 +0100 Subject: [PATCH 3/3] fix: Flush the database after each WriteBatch injected --- src/database/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/database/mod.rs b/src/database/mod.rs index d8b1bea79..70ca62d92 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -267,6 +267,7 @@ impl DatabaseIndex { let batch = update.build()?; self.db.write(batch)?; self.db.compact_range(None, None); + self.db.flush(true)?; let snapshot = Snapshot::new(self.db.clone()); let view = Arc::new(DatabaseView::new(snapshot)?);