From 86a0097311803394fc6a5bc01b820b617deaf5bc Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 15 Oct 2024 12:04:20 +0200 Subject: [PATCH] Use bumpalo in word docids --- .../extract/searchable/extract_word_docids.rs | 58 +++++++++++-------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index fd74cc8ce..66f7ae5e8 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,9 +1,11 @@ use std::cell::RefCell; use std::collections::HashMap; use std::fs::File; +use std::mem::size_of; use std::num::NonZero; use std::ops::DerefMut as _; +use bumpalo::collections::vec::Vec as BumpVec; use bumpalo::Bump; use grenad::{Merger, MergerBuilder}; use heed::RoTxn; @@ -113,30 +115,33 @@ impl WordDocidsCachedSorters { word: &str, exact: bool, docid: u32, - buffer: &mut Vec, + bump: &Bump, ) -> Result<()> { - let key = word.as_bytes(); + let word_bytes = word.as_bytes(); if exact { - self.exact_word_docids.insert_add_u32(key, docid)?; + self.exact_word_docids.insert_add_u32(word_bytes, docid)?; } else { - self.word_docids.insert_add_u32(key, docid)?; + self.word_docids.insert_add_u32(word_bytes, docid)?; } + let buffer_size = word_bytes.len() + 1 + size_of::(); + let mut buffer = BumpVec::with_capacity_in(buffer_size, bump); + buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&field_id.to_be_bytes()); - self.word_fid_docids.insert_add_u32(buffer, docid)?; + self.word_fid_docids.insert_add_u32(&buffer, docid)?; let position = bucketed_position(position); buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&position.to_be_bytes()); - self.word_position_docids.insert_add_u32(buffer, docid)?; + self.word_position_docids.insert_add_u32(&buffer, docid)?; if self.current_docid.map_or(false, |id| docid != id) { - self.flush_fid_word_count(buffer)?; + self.flush_fid_word_count(&mut buffer)?; } self.fid_word_count @@ -155,30 +160,33 @@ impl WordDocidsCachedSorters { word: &str, exact: bool, docid: u32, - buffer: &mut Vec, + bump: &Bump, ) -> Result<()> { - let key = word.as_bytes(); + let word_bytes = word.as_bytes(); if exact { - self.exact_word_docids.insert_del_u32(key, docid)?; + self.exact_word_docids.insert_del_u32(word_bytes, docid)?; } else { - self.word_docids.insert_del_u32(key, docid)?; + self.word_docids.insert_del_u32(word_bytes, docid)?; } + let buffer_size = word_bytes.len() + 1 + size_of::(); + let mut buffer = BumpVec::with_capacity_in(buffer_size, bump); + buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&field_id.to_be_bytes()); - self.word_fid_docids.insert_del_u32(buffer, docid)?; + self.word_fid_docids.insert_del_u32(&buffer, docid)?; let position = bucketed_position(position); buffer.clear(); - buffer.extend_from_slice(word.as_bytes()); + buffer.extend_from_slice(word_bytes); buffer.push(0); buffer.extend_from_slice(&position.to_be_bytes()); - self.word_position_docids.insert_del_u32(buffer, docid)?; + self.word_position_docids.insert_del_u32(&buffer, docid)?; if self.current_docid.map_or(false, |id| docid != id) { - self.flush_fid_word_count(buffer)?; + self.flush_fid_word_count(&mut buffer)?; } self.fid_word_count @@ -190,7 +198,7 @@ impl WordDocidsCachedSorters { Ok(()) } - fn flush_fid_word_count(&mut self, buffer: &mut Vec) -> Result<()> { + fn flush_fid_word_count(&mut self, buffer: &mut BumpVec) -> Result<()> { for (fid, (current_count, new_count)) in self.fid_word_count.drain() { if current_count != new_count { if current_count <= MAX_COUNTED_WORDS { @@ -415,11 +423,11 @@ impl WordDocidsExtractors { let cached_sorter = cached_sorter.deref_mut(); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut(); let new_fields_ids_map = new_fields_ids_map.deref_mut(); + let doc_alloc = &context.doc_alloc; let exact_attributes = index.exact_attributes(rtxn)?; let is_exact_attribute = |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); - let mut buffer = Vec::new(); match document_change { DocumentChange::Deletion(inner) => { let mut token_fn = |fname: &str, fid, pos, word: &str| { @@ -430,7 +438,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -449,7 +457,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -467,7 +475,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -486,7 +494,7 @@ impl WordDocidsExtractors { word, is_exact_attribute(fname), inner.docid(), - &mut buffer, + doc_alloc, ) .map_err(crate::Error::from) }; @@ -498,6 +506,8 @@ impl WordDocidsExtractors { } } + let buffer_size = size_of::(); + let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc); cached_sorter.flush_fid_word_count(&mut buffer) }