Merge pull request #125 from Kerollmops/limit-memory-usage

Limit memory usage
Clément Renault 2019-03-05 16:17:56 +01:00 committed by GitHub
commit 915f2e70a3
7 changed files with 25 additions and 23 deletions

@@ -126,7 +126,7 @@ fn crop_text(
             (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
         })
         .map(|match_| {
-            Match { char_index: match_.char_index - start as u32, ..match_ }
+            Match { char_index: match_.char_index - start as u16, ..match_ }
         })
         .collect();
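
Cropping keeps only the matches inside the selected window and rebases their char_index on the window start; with char_index narrowed to u16, an offset can no longer exceed 65_535. A minimal standalone sketch of that rebasing, using a simplified Match that carries only the two fields visible in this hunk:

    #[derive(Debug, Clone, Copy)]
    struct Match { char_index: u16, char_length: u16 }

    // Rebase a kept match on the crop window start, mirroring the hunk above.
    fn rebase(m: Match, start: usize) -> Match {
        Match { char_index: m.char_index - start as u16, ..m }
    }

    fn main() {
        let m = Match { char_index: 120, char_length: 5 };
        assert_eq!(rebase(m, 100).char_index, 20);
    }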

@@ -266,6 +266,8 @@ impl DatabaseIndex {
     fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
         let batch = update.build()?;
         self.db.write(batch)?;
+        self.db.compact_range(None, None);
+        self.db.flush(true)?;
 
         let snapshot = Snapshot::new(self.db.clone());
         let view = Arc::new(DatabaseView::new(snapshot)?);
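
Compacting and flushing right after each committed update is what bounds the RocksDB footprint here: the flush writes in-memory memtables out to SST files instead of letting them grow, and the full-range compaction merges SST files and drops obsolete entries. For context, a rough standalone sketch of the same pattern against the rocksdb crate (rust-rocksdb) directly; the crate and its exact signatures are an assumption, and the hunk's own flush(true) takes a sync flag, so the binding or wrapper used there differs from this sketch:

    use rocksdb::{DB, Error, WriteBatch};

    // Apply a batch, then compact the whole key range and flush memtables to disk.
    fn commit_batch(db: &DB, batch: WriteBatch) -> Result<(), Error> {
        db.write(batch)?;
        db.compact_range(None::<&[u8]>, None::<&[u8]>); // whole key range
        db.flush()?;                                    // persist memtables
        Ok(())
    }

    fn main() -> Result<(), Error> {
        let db = DB::open_default("/tmp/limit-memory-demo")?;
        let mut batch = WriteBatch::default();
        batch.put(b"doc:1", b"hello");
        commit_batch(&db, batch)
    }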

@@ -56,7 +56,7 @@ where B: TokenizerBuilder
         // FIXME must u32::try_from instead
         let attribute = self.attribute.0;
-        let word_index = word_index as u32;
+        let word_index = word_index as u16;
 
         // insert the exact representation
         let word_lower = word.to_lowercase();
@@ -69,7 +69,7 @@ where B: TokenizerBuilder
         let word_unidecoded = unidecode::unidecode(word).to_lowercase();
         let word_unidecoded = word_unidecoded.trim();
         if word_lower != word_unidecoded {
-            let char_index = char_index as u32;
+            let char_index = char_index as u16;
             let char_length = length;
             let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
@@ -77,7 +77,7 @@ where B: TokenizerBuilder
             }
         }
 
-        let char_index = char_index as u32;
+        let char_index = char_index as u16;
         let char_length = length;
         let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
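
The FIXME above becomes more relevant now that the fields are u16: a plain `as u16` silently wraps positions past 65_535. A hypothetical checked variant (not part of this PR) could look like this:

    use std::convert::TryFrom;

    // Hypothetical helper: fail loudly instead of wrapping when a token position
    // does not fit the narrowed u16 index fields.
    fn narrow_index(position: usize) -> u16 {
        u16::try_from(position).expect("token position overflows u16")
    }

    fn main() {
        assert_eq!(narrow_index(42), 42);
        // narrow_index(70_000) would panic instead of silently becoming 4_464.
    }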

@@ -50,14 +50,14 @@ pub struct DocIndex {
     /// The attribute in the document where the word was found
     /// along with the index in it.
     pub attribute: u16,
-    pub word_index: u32,
+    pub word_index: u16,
 
     /// The position in bytes where the word was found
     /// along with the length of it.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub char_index: u32,
+    pub char_index: u16,
     pub char_length: u16,
 }
@@ -84,7 +84,7 @@ pub struct Match {
     /// The attribute in the document where the word was found
     /// along with the index in it.
     pub attribute: u16,
-    pub word_index: u32,
+    pub word_index: u16,
 
     /// Whether the word that match is an exact match or a prefix.
     pub is_exact: bool,
@@ -94,7 +94,7 @@ pub struct Match {
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub char_index: u32,
+    pub char_index: u16,
     pub char_length: u16,
 }
@@ -116,9 +116,9 @@ impl Match {
             query_index: u32::max_value(),
             distance: u8::max_value(),
             attribute: u16::max_value(),
-            word_index: u32::max_value(),
+            word_index: u16::max_value(),
             is_exact: true,
-            char_index: u32::max_value(),
+            char_index: u16::max_value(),
             char_length: u16::max_value(),
         }
     }
@@ -131,6 +131,6 @@ mod tests {
     #[test]
     fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 24);
+        assert_eq!(mem::size_of::<DocIndex>(), 16);
     }
 }
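
The new assertion follows from the layout: assuming DocumentId wraps a u64, the old struct held 8 + 2 + 4 + 4 + 2 = 20 bytes of fields, padded to 24 for 8-byte alignment; with u16 indexes it holds 8 + 2 + 2 + 2 + 2 = 16 bytes, exactly two words with no padding. A standalone sketch of the new layout (field types taken from the hunk, DocumentId assumed to be a u64 newtype):

    use std::mem;

    struct DocumentId(u64); // assumption: the real type wraps a u64

    struct DocIndex {
        document_id: DocumentId,
        attribute: u16,
        word_index: u16,
        char_index: u16,
        char_length: u16,
    }

    fn main() {
        // 8 + 2 + 2 + 2 + 2 = 16 bytes, naturally aligned to 8.
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }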

@@ -6,7 +6,7 @@ use crate::rank::criterion::Criterion;
 use crate::rank::RawDocument;
 
 #[inline]
-fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
+fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
     let mut sum_word_index = 0;
     let mut index = 0;

@@ -5,14 +5,14 @@ use slice_group_by::GroupBy;
 use crate::rank::criterion::Criterion;
 use crate::rank::RawDocument;
 
-const MAX_DISTANCE: u32 = 8;
+const MAX_DISTANCE: u16 = 8;
 
 #[inline]
 fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
     (a.clone(), b.clone())
 }
 
-fn index_proximity(lhs: u32, rhs: u32) -> u32 {
+fn index_proximity(lhs: u16, rhs: u16) -> u16 {
     if lhs < rhs {
         cmp::min(rhs - lhs, MAX_DISTANCE)
     } else {
@@ -20,13 +20,13 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
     }
 }
 
-fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
+fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
     if lattr != rattr { return MAX_DISTANCE }
     index_proximity(lwi, rwi)
 }
 
-fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
+fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
-    let mut min_prox = u32::max_value();
+    let mut min_prox = u16::max_value();
 
     for a in lattr.iter().zip(lwi) {
         for b in rattr.iter().zip(rwi) {
@@ -43,8 +43,8 @@ fn matches_proximity(
     query_index: &[u32],
     distance: &[u8],
     attribute: &[u16],
-    word_index: &[u32],
-) -> u32
+    word_index: &[u16],
+) -> u16
 {
     let mut query_index_groups = query_index.linear_group();
     let mut proximity = 0;
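
Proximities are clamped at MAX_DISTANCE = 8, so the value range fits comfortably in a u16; the narrowing only has to follow word_index, which these functions consume. A standalone sketch of the in-order case visible in the hunk (the reversed-order branch is cut off by the diff and not reproduced here):

    use std::cmp;

    const MAX_DISTANCE: u16 = 8;

    // Distance between two word positions in the same attribute, clamped at 8,
    // for the lhs < rhs case shown above.
    fn index_proximity_in_order(lhs: u16, rhs: u16) -> u16 {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    }

    fn main() {
        assert_eq!(index_proximity_in_order(3, 5), 2);             // nearby words
        assert_eq!(index_proximity_in_order(0, 50), MAX_DISTANCE); // clamped
    }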

@@ -79,7 +79,7 @@ impl RawDocument {
         unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
     }
 
-    pub fn word_index(&self) -> &[u32] {
+    pub fn word_index(&self) -> &[u16] {
         let r = self.matches.range;
         // it is safe because construction/modifications
         // can only be done in this module
@@ -93,7 +93,7 @@ impl RawDocument {
         unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
     }
 
-    pub fn char_index(&self) -> &[u32] {
+    pub fn char_index(&self) -> &[u16] {
         let r = self.matches.range;
         // it is safe because construction/modifications
         // can only be done in this module
@@ -150,9 +150,9 @@ struct Matches {
     query_index: Vec<u32>,
     distance: Vec<u8>,
     attribute: Vec<u16>,
-    word_index: Vec<u32>,
+    word_index: Vec<u16>,
     is_exact: Vec<bool>,
-    char_index: Vec<u32>,
+    char_index: Vec<u16>,
     char_length: Vec<u16>,
 }
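
Matches is a struct of parallel columns, so the narrowing pays off per match: word_index and char_index together shrink from 8 to 4 bytes per entry. A back-of-the-envelope sketch of the saving (ignoring Vec headers and allocator overhead):

    use std::mem;

    fn main() {
        let before = 2 * mem::size_of::<u32>(); // word_index + char_index as u32
        let after = 2 * mem::size_of::<u16>();  // word_index + char_index as u16
        assert_eq!((before, after), (8, 4));
        // For ten million matches, these two columns alone drop from 80 MB to 40 MB.
        println!("{} bytes saved per match", before - after);
    }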