mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Merge #708
708: Reduce memory usage of the MatchingWords structure r=ManyTheFish a=loiclec # Pull Request ## Related issue Fixes (partially) https://github.com/meilisearch/meilisearch/issues/3115 ## What does this PR do? 1. Reduces the memory usage caused by the creation of a 10-word query tree by 20x. This is done by deduplicating the `MatchingWord` values, which are heavy because of their inner DFA. The deduplication works by wrapping each `MatchingWord` in a reference-counted box and using a hash map to determine whether a `MatchingWord` DFA already exists for a certain signature, or whether a new one needs to be built. 2. Avoid the worst-case scenario of creating a `MatchingWord` for extremely long words that cannot be indexed by milli. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
commit
5e754b3ee0
9 changed files with 261 additions and 86 deletions
|
@ -70,6 +70,21 @@ pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
|
|||
/// expressed in term of latitude and longitude.
|
||||
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
|
||||
|
||||
/// The maximum length a LMDB key can be.
|
||||
///
|
||||
/// Note that the actual allowed length is a little bit higher, but
|
||||
/// we keep a margin of safety.
|
||||
const MAX_LMDB_KEY_LENGTH: usize = 500;
|
||||
|
||||
/// The maximum length a field value can be when inserted in an LMDB key.
|
||||
///
|
||||
/// This number is determined by the keys of the different facet databases
|
||||
/// and adding a margin of safety.
|
||||
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
|
||||
|
||||
/// The maximum length a word can be
|
||||
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
|
||||
|
||||
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
|
||||
|
||||
// Convert an absolute word position into a relative position.
|
||||
|
|
|
@ -2,11 +2,13 @@ use std::cmp::{min, Reverse};
|
|||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::Token;
|
||||
use levenshtein_automata::{Distance, DFA};
|
||||
|
||||
use crate::search::build_dfa;
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
type IsPrefix = bool;
|
||||
|
||||
|
@ -14,11 +16,22 @@ type IsPrefix = bool;
|
|||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
||||
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
writeln!(f, "[")?;
|
||||
for (matching_words, primitive_word_id) in self.inner.iter() {
|
||||
writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
|
||||
}
|
||||
writeln!(f, "]")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
|
||||
pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
|
||||
// Sort words by len in DESC order prioritizing the longest matches,
|
||||
// in order to highlight the longest part of the matched word.
|
||||
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
||||
|
@ -35,7 +48,8 @@ impl MatchingWords {
|
|||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
|
||||
#[allow(clippy::type_complexity)]
|
||||
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
|
@ -91,10 +105,13 @@ impl PartialEq for MatchingWord {
|
|||
}
|
||||
|
||||
impl MatchingWord {
|
||||
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
|
||||
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
|
||||
if word.len() > MAX_WORD_LENGTH {
|
||||
return None;
|
||||
}
|
||||
let dfa = build_dfa(&word, typo, prefix);
|
||||
|
||||
Self { dfa, word, typo, prefix }
|
||||
Some(Self { dfa, word, typo, prefix })
|
||||
}
|
||||
|
||||
/// Returns the length in chars of the match in case the token matches the term.
|
||||
|
@ -126,7 +143,7 @@ pub enum MatchType<'a> {
|
|||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: &'a [MatchingWord],
|
||||
matching_words: &'a [Rc<MatchingWord>],
|
||||
ids: &'a [PrimitiveWordId],
|
||||
char_len: usize,
|
||||
}
|
||||
|
@ -332,10 +349,15 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn matching_words() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
|
||||
Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
|
||||
(vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
|
||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
|
|
|
@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::TokenizerBuilder;
|
||||
|
||||
use super::*;
|
||||
use crate::search::matches::matching_words::MatchingWord;
|
||||
|
||||
fn matching_words() -> MatchingWords {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
|
||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
|
||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
MatchingWords::new(matching_words)
|
||||
|
@ -587,10 +594,11 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn highlight_unicode() {
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
|
||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
|
||||
|
@ -823,24 +831,20 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn partial_matches() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
|
||||
(
|
||||
vec![
|
||||
MatchingWord::new("t".to_string(), 0, false),
|
||||
MatchingWord::new("he".to_string(), 0, false),
|
||||
],
|
||||
vec![0],
|
||||
),
|
||||
(vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]),
|
||||
(
|
||||
vec![
|
||||
MatchingWord::new("do".to_string(), 0, false),
|
||||
MatchingWord::new("or".to_string(), 0, false),
|
||||
],
|
||||
vec![1],
|
||||
),
|
||||
(vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]),
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone(), all[2].clone()], vec![0]),
|
||||
(vec![all[3].clone()], vec![1]),
|
||||
(vec![all[4].clone(), all[5].clone()], vec![1]),
|
||||
(vec![all[4].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
use std::borrow::Cow;
|
||||
use std::cmp::max;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::Hash;
|
||||
use std::rc::Rc;
|
||||
use std::{fmt, mem};
|
||||
|
||||
use charabia::classifier::ClassifiedTokenIter;
|
||||
|
@ -540,6 +544,29 @@ fn create_query_tree(
|
|||
Ok(Operation::or(true, operation_children))
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
struct MatchingWordCache {
|
||||
all: Vec<Rc<MatchingWord>>,
|
||||
map: HashMap<(String, u8, bool), Rc<MatchingWord>>,
|
||||
}
|
||||
impl MatchingWordCache {
|
||||
fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option<Rc<MatchingWord>> {
|
||||
match self.map.entry((word.clone(), typo, prefix)) {
|
||||
Entry::Occupied(idx) => Some(idx.get().clone()),
|
||||
Entry::Vacant(vacant) => {
|
||||
let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
|
||||
self.all.push(matching_word.clone());
|
||||
vacant.insert(matching_word.clone());
|
||||
Some(matching_word)
|
||||
}
|
||||
}
|
||||
// To deactivate the cache, for testing purposes, use the following instead:
|
||||
// let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
|
||||
// self.all.push(matching_word.clone());
|
||||
// Some(matching_word)
|
||||
}
|
||||
}
|
||||
|
||||
/// Main function that creates the matching words used for crop and highlight.
|
||||
fn create_matching_words(
|
||||
ctx: &impl Context,
|
||||
|
@ -551,7 +578,8 @@ fn create_matching_words(
|
|||
ctx: &impl Context,
|
||||
authorize_typos: bool,
|
||||
part: PrimitiveQueryPart,
|
||||
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
||||
matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
matching_word_cache: &mut MatchingWordCache,
|
||||
id: PrimitiveWordId,
|
||||
) -> Result<()> {
|
||||
match part {
|
||||
|
@ -560,18 +588,27 @@ fn create_matching_words(
|
|||
PrimitiveQueryPart::Word(word, prefix) => {
|
||||
if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
|
||||
for synonym in synonyms {
|
||||
let synonym = synonym
|
||||
// Require that all words of the synonym have a corresponding MatchingWord
|
||||
// before adding any of its words to the matching_words result.
|
||||
if let Some(synonym_matching_words) = synonym
|
||||
.into_iter()
|
||||
.map(|syn| MatchingWord::new(syn, 0, false))
|
||||
.collect();
|
||||
matching_words.push((synonym, vec![id]));
|
||||
.map(|word| matching_word_cache.insert(word, 0, false))
|
||||
.collect()
|
||||
{
|
||||
matching_words.push((synonym_matching_words, vec![id]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
|
||||
let left = MatchingWord::new(left.to_string(), 0, false);
|
||||
let right = MatchingWord::new(right.to_string(), 0, false);
|
||||
matching_words.push((vec![left, right], vec![id]));
|
||||
// Require that both left and right words have a corresponding MatchingWord
|
||||
// before adding them to the matching_words result
|
||||
if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) {
|
||||
if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false)
|
||||
{
|
||||
matching_words.push((vec![left, right], vec![id]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||
|
@ -580,18 +617,29 @@ fn create_matching_words(
|
|||
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
||||
|
||||
let matching_word = match typos(word, authorize_typos, config) {
|
||||
QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
|
||||
QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
|
||||
QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix),
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
matching_word_cache.insert(word, typo, prefix)
|
||||
}
|
||||
};
|
||||
matching_words.push((vec![matching_word], vec![id]));
|
||||
if let Some(matching_word) = matching_word {
|
||||
matching_words.push((vec![matching_word], vec![id]));
|
||||
}
|
||||
}
|
||||
// create CONSECUTIVE matching words wrapping all words in the phrase
|
||||
PrimitiveQueryPart::Phrase(words) => {
|
||||
let ids: Vec<_> =
|
||||
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
|
||||
let words =
|
||||
words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect();
|
||||
matching_words.push((words, ids));
|
||||
// Require that all words of the phrase have a corresponding MatchingWord
|
||||
// before adding any of them to the matching_words result
|
||||
if let Some(phrase_matching_words) = words
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|w| matching_word_cache.insert(w, 0, false))
|
||||
.collect()
|
||||
{
|
||||
matching_words.push((phrase_matching_words, ids));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -603,7 +651,8 @@ fn create_matching_words(
|
|||
ctx: &impl Context,
|
||||
authorize_typos: bool,
|
||||
query: &[PrimitiveQueryPart],
|
||||
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
||||
matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
matching_word_cache: &mut MatchingWordCache,
|
||||
mut id: PrimitiveWordId,
|
||||
) -> Result<()> {
|
||||
const MAX_NGRAM: usize = 3;
|
||||
|
@ -621,6 +670,7 @@ fn create_matching_words(
|
|||
authorize_typos,
|
||||
part.clone(),
|
||||
matching_words,
|
||||
matching_word_cache,
|
||||
id,
|
||||
)?;
|
||||
}
|
||||
|
@ -643,11 +693,13 @@ fn create_matching_words(
|
|||
|
||||
if let Some(synonyms) = ctx.synonyms(&words)? {
|
||||
for synonym in synonyms {
|
||||
let synonym = synonym
|
||||
if let Some(synonym) = synonym
|
||||
.into_iter()
|
||||
.map(|syn| MatchingWord::new(syn, 0, false))
|
||||
.collect();
|
||||
matching_words.push((synonym, ids.clone()));
|
||||
.map(|syn| matching_word_cache.insert(syn, 0, false))
|
||||
.collect()
|
||||
{
|
||||
matching_words.push((synonym, ids.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
let word = words.concat();
|
||||
|
@ -662,18 +714,27 @@ fn create_matching_words(
|
|||
};
|
||||
let matching_word = match typos(word, authorize_typos, config) {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
MatchingWord::new(word, 0, is_prefix)
|
||||
matching_word_cache.insert(word, 0, is_prefix)
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
MatchingWord::new(word, typo, is_prefix)
|
||||
matching_word_cache.insert(word, typo, is_prefix)
|
||||
}
|
||||
};
|
||||
matching_words.push((vec![matching_word], ids));
|
||||
if let Some(matching_word) = matching_word {
|
||||
matching_words.push((vec![matching_word], ids));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !is_last {
|
||||
ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
|
||||
ngrams(
|
||||
ctx,
|
||||
authorize_typos,
|
||||
tail,
|
||||
matching_words,
|
||||
matching_word_cache,
|
||||
id + 1,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -683,8 +744,9 @@ fn create_matching_words(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
let mut matching_word_cache = MatchingWordCache::default();
|
||||
let mut matching_words = Vec::new();
|
||||
ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
|
||||
ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?;
|
||||
Ok(MatchingWords::new(matching_words))
|
||||
}
|
||||
|
||||
|
@ -814,6 +876,7 @@ mod test {
|
|||
use rand::{Rng, SeedableRng};
|
||||
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||
|
||||
#[derive(Debug)]
|
||||
|
@ -1294,6 +1357,27 @@ mod test {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dont_create_matching_word_for_long_words() {
|
||||
let index = TempIndex::new();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house";
|
||||
let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
|
||||
builder.words_limit(10);
|
||||
let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap();
|
||||
insta::assert_snapshot!(format!("{matching_words:?}"), @r###"
|
||||
[
|
||||
([MatchingWord { word: "house", typo: 1, prefix: true }], [3])
|
||||
([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
|
||||
([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1])
|
||||
([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
|
||||
([MatchingWord { word: "house", typo: 1, prefix: true }], [1])
|
||||
([MatchingWord { word: "what", typo: 0, prefix: false }], [0])
|
||||
([MatchingWord { word: "a", typo: 0, prefix: false }], [1])
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn disable_typo_on_word() {
|
||||
let query = "goodbye";
|
||||
|
@ -1310,4 +1394,67 @@ mod test {
|
|||
Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
|
||||
));
|
||||
}
|
||||
|
||||
// The memory usage test below is disabled because `cargo test` runs multiple tests in parallel,
|
||||
// which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run
|
||||
// manually from time to time, so I kept it here, commented-out.
|
||||
|
||||
// use std::alloc::{GlobalAlloc, System};
|
||||
// use std::sync::atomic::{self, AtomicI64};
|
||||
//
|
||||
// #[global_allocator]
|
||||
// static ALLOC: CountingAlloc =
|
||||
// CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) };
|
||||
//
|
||||
// pub struct CountingAlloc {
|
||||
// pub resident: AtomicI64,
|
||||
// pub allocated: AtomicI64,
|
||||
// }
|
||||
// unsafe impl GlobalAlloc for CountingAlloc {
|
||||
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
|
||||
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||
//
|
||||
// System.alloc(layout)
|
||||
// }
|
||||
//
|
||||
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
|
||||
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||
// System.dealloc(ptr, layout)
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// #[test]
|
||||
// fn memory_usage_of_ten_word_query() {
|
||||
// let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst);
|
||||
// let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst);
|
||||
//
|
||||
// let index = TempIndex::new();
|
||||
// let rtxn = index.read_txn().unwrap();
|
||||
// let query = "a beautiful summer house by the beach overlooking what seems";
|
||||
// let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
|
||||
// builder.words_limit(10);
|
||||
// let x = builder.build(query.tokenize()).unwrap().unwrap();
|
||||
// let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst);
|
||||
// let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst);
|
||||
//
|
||||
// // Weak check on the memory usage
|
||||
// // Don't keep more than 5MB. (Arguably 5MB is already too high)
|
||||
// assert!(resident_after - resident_before < 5_000_000);
|
||||
// // Don't allocate more than 10MB.
|
||||
// assert!(allocated_after - allocated_before < 10_000_000);
|
||||
//
|
||||
// // Use these snapshots to measure the exact memory usage.
|
||||
// // The values below were correct at the time I wrote them.
|
||||
// // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950");
|
||||
// // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502");
|
||||
//
|
||||
// // Note, with the matching word cache deactivated, the memory usage was:
|
||||
// // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697");
|
||||
// // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588");
|
||||
// // or about 20x more resident memory (90MB vs 4.5MB)
|
||||
//
|
||||
// // Use x
|
||||
// let _x = x;
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
|
|||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{
|
||||
concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH,
|
||||
};
|
||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
use crate::{
|
||||
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
||||
};
|
||||
|
||||
/// Extracts the word and positions where this word appear and
|
||||
/// prefixes it by the document id.
|
||||
|
|
|
@ -6,9 +6,8 @@ use heed::BytesEncode;
|
|||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
|
||||
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
|
||||
use crate::{FieldId, Result};
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
|
|
|
@ -12,9 +12,8 @@ use serde_json::Value;
|
|||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
|
|
|
@ -18,20 +18,7 @@ pub use merge_functions::{
|
|||
serialize_roaring_bitmap, MergeFn,
|
||||
};
|
||||
|
||||
/// The maximum length a LMDB key can be.
|
||||
///
|
||||
/// Note that the actual allowed length is a little bit higher, but
|
||||
/// we keep a margin of safety.
|
||||
const MAX_LMDB_KEY_LENGTH: usize = 500;
|
||||
|
||||
/// The maximum length a field value can be when inserted in an LMDB key.
|
||||
///
|
||||
/// This number is determined by the keys of the different facet databases
|
||||
/// and adding a margin of safety.
|
||||
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
|
||||
|
||||
/// The maximum length a word can be
|
||||
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
||||
key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue