mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-15 17:38:56 +01:00
Use bumpalo in word docids
This commit is contained in:
parent
c75de1f391
commit
86a0097311
@ -1,9 +1,11 @@
|
|||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::mem::size_of;
|
||||||
use std::num::NonZero;
|
use std::num::NonZero;
|
||||||
use std::ops::DerefMut as _;
|
use std::ops::DerefMut as _;
|
||||||
|
|
||||||
|
use bumpalo::collections::vec::Vec as BumpVec;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use grenad::{Merger, MergerBuilder};
|
use grenad::{Merger, MergerBuilder};
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
@ -113,30 +115,33 @@ impl WordDocidsCachedSorters {
|
|||||||
word: &str,
|
word: &str,
|
||||||
exact: bool,
|
exact: bool,
|
||||||
docid: u32,
|
docid: u32,
|
||||||
buffer: &mut Vec<u8>,
|
bump: &Bump,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let key = word.as_bytes();
|
let word_bytes = word.as_bytes();
|
||||||
if exact {
|
if exact {
|
||||||
self.exact_word_docids.insert_add_u32(key, docid)?;
|
self.exact_word_docids.insert_add_u32(word_bytes, docid)?;
|
||||||
} else {
|
} else {
|
||||||
self.word_docids.insert_add_u32(key, docid)?;
|
self.word_docids.insert_add_u32(word_bytes, docid)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||||
|
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||||
|
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.extend_from_slice(word.as_bytes());
|
buffer.extend_from_slice(word_bytes);
|
||||||
buffer.push(0);
|
buffer.push(0);
|
||||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
self.word_fid_docids.insert_add_u32(buffer, docid)?;
|
self.word_fid_docids.insert_add_u32(&buffer, docid)?;
|
||||||
|
|
||||||
let position = bucketed_position(position);
|
let position = bucketed_position(position);
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.extend_from_slice(word.as_bytes());
|
buffer.extend_from_slice(word_bytes);
|
||||||
buffer.push(0);
|
buffer.push(0);
|
||||||
buffer.extend_from_slice(&position.to_be_bytes());
|
buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
self.word_position_docids.insert_add_u32(buffer, docid)?;
|
self.word_position_docids.insert_add_u32(&buffer, docid)?;
|
||||||
|
|
||||||
if self.current_docid.map_or(false, |id| docid != id) {
|
if self.current_docid.map_or(false, |id| docid != id) {
|
||||||
self.flush_fid_word_count(buffer)?;
|
self.flush_fid_word_count(&mut buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.fid_word_count
|
self.fid_word_count
|
||||||
@ -155,30 +160,33 @@ impl WordDocidsCachedSorters {
|
|||||||
word: &str,
|
word: &str,
|
||||||
exact: bool,
|
exact: bool,
|
||||||
docid: u32,
|
docid: u32,
|
||||||
buffer: &mut Vec<u8>,
|
bump: &Bump,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let key = word.as_bytes();
|
let word_bytes = word.as_bytes();
|
||||||
if exact {
|
if exact {
|
||||||
self.exact_word_docids.insert_del_u32(key, docid)?;
|
self.exact_word_docids.insert_del_u32(word_bytes, docid)?;
|
||||||
} else {
|
} else {
|
||||||
self.word_docids.insert_del_u32(key, docid)?;
|
self.word_docids.insert_del_u32(word_bytes, docid)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||||
|
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||||
|
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.extend_from_slice(word.as_bytes());
|
buffer.extend_from_slice(word_bytes);
|
||||||
buffer.push(0);
|
buffer.push(0);
|
||||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
self.word_fid_docids.insert_del_u32(buffer, docid)?;
|
self.word_fid_docids.insert_del_u32(&buffer, docid)?;
|
||||||
|
|
||||||
let position = bucketed_position(position);
|
let position = bucketed_position(position);
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.extend_from_slice(word.as_bytes());
|
buffer.extend_from_slice(word_bytes);
|
||||||
buffer.push(0);
|
buffer.push(0);
|
||||||
buffer.extend_from_slice(&position.to_be_bytes());
|
buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
self.word_position_docids.insert_del_u32(buffer, docid)?;
|
self.word_position_docids.insert_del_u32(&buffer, docid)?;
|
||||||
|
|
||||||
if self.current_docid.map_or(false, |id| docid != id) {
|
if self.current_docid.map_or(false, |id| docid != id) {
|
||||||
self.flush_fid_word_count(buffer)?;
|
self.flush_fid_word_count(&mut buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.fid_word_count
|
self.fid_word_count
|
||||||
@ -190,7 +198,7 @@ impl WordDocidsCachedSorters {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn flush_fid_word_count(&mut self, buffer: &mut Vec<u8>) -> Result<()> {
|
fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> {
|
||||||
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
|
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
|
||||||
if current_count != new_count {
|
if current_count != new_count {
|
||||||
if current_count <= MAX_COUNTED_WORDS {
|
if current_count <= MAX_COUNTED_WORDS {
|
||||||
@ -415,11 +423,11 @@ impl WordDocidsExtractors {
|
|||||||
let cached_sorter = cached_sorter.deref_mut();
|
let cached_sorter = cached_sorter.deref_mut();
|
||||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut();
|
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut();
|
||||||
let new_fields_ids_map = new_fields_ids_map.deref_mut();
|
let new_fields_ids_map = new_fields_ids_map.deref_mut();
|
||||||
|
let doc_alloc = &context.doc_alloc;
|
||||||
|
|
||||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||||
let is_exact_attribute =
|
let is_exact_attribute =
|
||||||
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||||
let mut buffer = Vec::new();
|
|
||||||
match document_change {
|
match document_change {
|
||||||
DocumentChange::Deletion(inner) => {
|
DocumentChange::Deletion(inner) => {
|
||||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||||
@ -430,7 +438,7 @@ impl WordDocidsExtractors {
|
|||||||
word,
|
word,
|
||||||
is_exact_attribute(fname),
|
is_exact_attribute(fname),
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
&mut buffer,
|
doc_alloc,
|
||||||
)
|
)
|
||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -449,7 +457,7 @@ impl WordDocidsExtractors {
|
|||||||
word,
|
word,
|
||||||
is_exact_attribute(fname),
|
is_exact_attribute(fname),
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
&mut buffer,
|
doc_alloc,
|
||||||
)
|
)
|
||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -467,7 +475,7 @@ impl WordDocidsExtractors {
|
|||||||
word,
|
word,
|
||||||
is_exact_attribute(fname),
|
is_exact_attribute(fname),
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
&mut buffer,
|
doc_alloc,
|
||||||
)
|
)
|
||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -486,7 +494,7 @@ impl WordDocidsExtractors {
|
|||||||
word,
|
word,
|
||||||
is_exact_attribute(fname),
|
is_exact_attribute(fname),
|
||||||
inner.docid(),
|
inner.docid(),
|
||||||
&mut buffer,
|
doc_alloc,
|
||||||
)
|
)
|
||||||
.map_err(crate::Error::from)
|
.map_err(crate::Error::from)
|
||||||
};
|
};
|
||||||
@ -498,6 +506,8 @@ impl WordDocidsExtractors {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let buffer_size = size_of::<FieldId>();
|
||||||
|
let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
|
||||||
cached_sorter.flush_fid_word_count(&mut buffer)
|
cached_sorter.flush_fid_word_count(&mut buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user