mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-25 04:37:32 +01:00
Merge #708
708: Reduce memory usage of the MatchingWords structure r=ManyTheFish a=loiclec

# Pull Request

## Related issue
Fixes (partially) https://github.com/meilisearch/meilisearch/issues/3115

## What does this PR do?
1. Reduces the memory usage caused by the creation of a 10-word query tree by 20x. This is done by deduplicating the `MatchingWord` values, which are heavy because of their inner DFA. The deduplication works by wrapping each `MatchingWord` in a reference-counted box and using a hash map to determine whether a `MatchingWord` DFA already exists for a certain signature, or whether a new one needs to be built.
2. Avoids the worst-case scenario of creating a `MatchingWord` for extremely long words that cannot be indexed by milli.

Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
commit
5e754b3ee0
@ -1,3 +1,5 @@
|
|||||||
|
use std::rc::Rc;
|
||||||
|
|
||||||
use criterion::{criterion_group, criterion_main};
|
use criterion::{criterion_group, criterion_main};
|
||||||
use milli::tokenizer::TokenizerBuilder;
|
use milli::tokenizer::TokenizerBuilder;
|
||||||
use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
|
use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
|
||||||
@ -18,14 +20,14 @@ fn bench_formatting(c: &mut criterion::Criterion) {
|
|||||||
name: "'the door d'",
|
name: "'the door d'",
|
||||||
text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
|
text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
|
||||||
matching_words: MatcherBuilder::new(MatchingWords::new(vec![
|
matching_words: MatcherBuilder::new(MatchingWords::new(vec![
|
||||||
(vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]),
|
(vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]),
|
||||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
|
(vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]),
|
||||||
(vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]),
|
(vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]),
|
||||||
(vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]),
|
(vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]),
|
||||||
(vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]),
|
(vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]),
|
||||||
(vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]),
|
(vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]),
|
||||||
(vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]),
|
(vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]),
|
||||||
(vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]),
|
(vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]),
|
||||||
]
|
]
|
||||||
), TokenizerBuilder::default().build()),
|
), TokenizerBuilder::default().build()),
|
||||||
},
|
},
|
||||||
|
@ -70,6 +70,21 @@ pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
|
|||||||
/// expressed in terms of latitude and longitude.
|
/// expressed in terms of latitude and longitude.
|
||||||
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
|
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
|
||||||
|
|
||||||
|
/// The maximum length a LMDB key can be.
|
||||||
|
///
|
||||||
|
/// Note that the actual allowed length is a little bit higher, but
|
||||||
|
/// we keep a margin of safety.
|
||||||
|
const MAX_LMDB_KEY_LENGTH: usize = 500;
|
||||||
|
|
||||||
|
/// The maximum length a field value can be when inserted in an LMDB key.
|
||||||
|
///
|
||||||
|
/// This number is determined by the keys of the different facet databases
|
||||||
|
/// and adding a margin of safety.
|
||||||
|
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
|
||||||
|
|
||||||
|
/// The maximum length a word can be
|
||||||
|
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
|
||||||
|
|
||||||
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
|
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
|
||||||
|
|
||||||
// Convert an absolute word position into a relative position.
|
// Convert an absolute word position into a relative position.
|
||||||
|
@ -2,11 +2,13 @@ use std::cmp::{min, Reverse};
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::ops::{Index, IndexMut};
|
use std::ops::{Index, IndexMut};
|
||||||
|
use std::rc::Rc;
|
||||||
|
|
||||||
use charabia::Token;
|
use charabia::Token;
|
||||||
use levenshtein_automata::{Distance, DFA};
|
use levenshtein_automata::{Distance, DFA};
|
||||||
|
|
||||||
use crate::search::build_dfa;
|
use crate::search::build_dfa;
|
||||||
|
use crate::MAX_WORD_LENGTH;
|
||||||
|
|
||||||
type IsPrefix = bool;
|
type IsPrefix = bool;
|
||||||
|
|
||||||
@ -14,11 +16,22 @@ type IsPrefix = bool;
|
|||||||
/// referencing words that match the given query tree.
|
/// referencing words that match the given query tree.
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MatchingWords {
|
pub struct MatchingWords {
|
||||||
inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for MatchingWords {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
writeln!(f, "[")?;
|
||||||
|
for (matching_words, primitive_word_id) in self.inner.iter() {
|
||||||
|
writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
|
||||||
|
}
|
||||||
|
writeln!(f, "]")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MatchingWords {
|
impl MatchingWords {
|
||||||
pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
|
pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
|
||||||
// Sort words by length in DESC order, prioritizing the longest matches,
|
// Sort words by length in DESC order, prioritizing the longest matches,
|
||||||
// in order to highlight the longest part of the matched word.
|
// in order to highlight the longest part of the matched word.
|
||||||
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
||||||
@ -35,7 +48,8 @@ impl MatchingWords {
|
|||||||
/// Iterator over terms that match the given token,
|
/// Iterator over terms that match the given token,
|
||||||
/// This allows matches to be evaluated lazily.
|
/// This allows matches to be evaluated lazily.
|
||||||
pub struct MatchesIter<'a, 'b> {
|
pub struct MatchesIter<'a, 'b> {
|
||||||
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
|
#[allow(clippy::type_complexity)]
|
||||||
|
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
|
||||||
token: &'b Token<'b>,
|
token: &'b Token<'b>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -91,10 +105,13 @@ impl PartialEq for MatchingWord {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl MatchingWord {
|
impl MatchingWord {
|
||||||
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
|
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
|
||||||
|
if word.len() > MAX_WORD_LENGTH {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
let dfa = build_dfa(&word, typo, prefix);
|
let dfa = build_dfa(&word, typo, prefix);
|
||||||
|
|
||||||
Self { dfa, word, typo, prefix }
|
Some(Self { dfa, word, typo, prefix })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the length in chars of the match if the token matches the term.
|
/// Returns the length in chars of the match if the token matches the term.
|
||||||
@ -126,7 +143,7 @@ pub enum MatchType<'a> {
|
|||||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub struct PartialMatch<'a> {
|
pub struct PartialMatch<'a> {
|
||||||
matching_words: &'a [MatchingWord],
|
matching_words: &'a [Rc<MatchingWord>],
|
||||||
ids: &'a [PrimitiveWordId],
|
ids: &'a [PrimitiveWordId],
|
||||||
char_len: usize,
|
char_len: usize,
|
||||||
}
|
}
|
||||||
@ -332,10 +349,15 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn matching_words() {
|
fn matching_words() {
|
||||||
|
let all = vec![
|
||||||
|
Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||||
|
];
|
||||||
let matching_words = vec![
|
let matching_words = vec![
|
||||||
(vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
|
(vec![all[0].clone()], vec![0]),
|
||||||
(vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
|
(vec![all[1].clone()], vec![1]),
|
||||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
|
(vec![all[2].clone()], vec![2]),
|
||||||
];
|
];
|
||||||
|
|
||||||
let matching_words = MatchingWords::new(matching_words);
|
let matching_words = MatchingWords::new(matching_words);
|
||||||
|
@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::rc::Rc;
|
||||||
|
|
||||||
use charabia::TokenizerBuilder;
|
use charabia::TokenizerBuilder;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::search::matches::matching_words::MatchingWord;
|
use crate::search::matches::matching_words::MatchingWord;
|
||||||
|
|
||||||
fn matching_words() -> MatchingWords {
|
fn matching_words() -> MatchingWords {
|
||||||
|
let all = vec![
|
||||||
|
Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||||
|
];
|
||||||
let matching_words = vec![
|
let matching_words = vec![
|
||||||
(vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
|
(vec![all[0].clone()], vec![0]),
|
||||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
|
(vec![all[1].clone()], vec![1]),
|
||||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
|
(vec![all[2].clone()], vec![2]),
|
||||||
];
|
];
|
||||||
|
|
||||||
MatchingWords::new(matching_words)
|
MatchingWords::new(matching_words)
|
||||||
@ -587,10 +594,11 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn highlight_unicode() {
|
fn highlight_unicode() {
|
||||||
let matching_words = vec![
|
let all = vec![
|
||||||
(vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
|
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
|
||||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
|
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||||
];
|
];
|
||||||
|
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
|
||||||
|
|
||||||
let matching_words = MatchingWords::new(matching_words);
|
let matching_words = MatchingWords::new(matching_words);
|
||||||
|
|
||||||
@ -823,24 +831,20 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn partial_matches() {
|
fn partial_matches() {
|
||||||
|
let all = vec![
|
||||||
|
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
|
||||||
|
Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
|
||||||
|
];
|
||||||
let matching_words = vec![
|
let matching_words = vec![
|
||||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
|
(vec![all[0].clone()], vec![0]),
|
||||||
(
|
(vec![all[1].clone(), all[2].clone()], vec![0]),
|
||||||
vec![
|
(vec![all[3].clone()], vec![1]),
|
||||||
MatchingWord::new("t".to_string(), 0, false),
|
(vec![all[4].clone(), all[5].clone()], vec![1]),
|
||||||
MatchingWord::new("he".to_string(), 0, false),
|
(vec![all[4].clone()], vec![2]),
|
||||||
],
|
|
||||||
vec![0],
|
|
||||||
),
|
|
||||||
(vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]),
|
|
||||||
(
|
|
||||||
vec![
|
|
||||||
MatchingWord::new("do".to_string(), 0, false),
|
|
||||||
MatchingWord::new("or".to_string(), 0, false),
|
|
||||||
],
|
|
||||||
vec![1],
|
|
||||||
),
|
|
||||||
(vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]),
|
|
||||||
];
|
];
|
||||||
|
|
||||||
let matching_words = MatchingWords::new(matching_words);
|
let matching_words = MatchingWords::new(matching_words);
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::cmp::max;
|
use std::cmp::max;
|
||||||
|
use std::collections::hash_map::Entry;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::hash::Hash;
|
||||||
|
use std::rc::Rc;
|
||||||
use std::{fmt, mem};
|
use std::{fmt, mem};
|
||||||
|
|
||||||
use charabia::classifier::ClassifiedTokenIter;
|
use charabia::classifier::ClassifiedTokenIter;
|
||||||
@ -540,6 +544,29 @@ fn create_query_tree(
|
|||||||
Ok(Operation::or(true, operation_children))
|
Ok(Operation::or(true, operation_children))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Default, Debug)]
|
||||||
|
struct MatchingWordCache {
|
||||||
|
all: Vec<Rc<MatchingWord>>,
|
||||||
|
map: HashMap<(String, u8, bool), Rc<MatchingWord>>,
|
||||||
|
}
|
||||||
|
impl MatchingWordCache {
|
||||||
|
fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option<Rc<MatchingWord>> {
|
||||||
|
match self.map.entry((word.clone(), typo, prefix)) {
|
||||||
|
Entry::Occupied(idx) => Some(idx.get().clone()),
|
||||||
|
Entry::Vacant(vacant) => {
|
||||||
|
let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
|
||||||
|
self.all.push(matching_word.clone());
|
||||||
|
vacant.insert(matching_word.clone());
|
||||||
|
Some(matching_word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// To deactivate the cache, for testing purposes, use the following instead:
|
||||||
|
// let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
|
||||||
|
// self.all.push(matching_word.clone());
|
||||||
|
// Some(matching_word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Main function that creates the matching words used for crop and highlight.
|
/// Main function that creates the matching words used for crop and highlight.
|
||||||
fn create_matching_words(
|
fn create_matching_words(
|
||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
@ -551,7 +578,8 @@ fn create_matching_words(
|
|||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
part: PrimitiveQueryPart,
|
part: PrimitiveQueryPart,
|
||||||
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||||
|
matching_word_cache: &mut MatchingWordCache,
|
||||||
id: PrimitiveWordId,
|
id: PrimitiveWordId,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
match part {
|
match part {
|
||||||
@ -560,19 +588,28 @@ fn create_matching_words(
|
|||||||
PrimitiveQueryPart::Word(word, prefix) => {
|
PrimitiveQueryPart::Word(word, prefix) => {
|
||||||
if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
|
if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
|
||||||
for synonym in synonyms {
|
for synonym in synonyms {
|
||||||
let synonym = synonym
|
// Require that all words of the synonym have a corresponding MatchingWord
|
||||||
|
// before adding any of its words to the matching_words result.
|
||||||
|
if let Some(synonym_matching_words) = synonym
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|syn| MatchingWord::new(syn, 0, false))
|
.map(|word| matching_word_cache.insert(word, 0, false))
|
||||||
.collect();
|
.collect()
|
||||||
matching_words.push((synonym, vec![id]));
|
{
|
||||||
|
matching_words.push((synonym_matching_words, vec![id]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
|
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
|
||||||
let left = MatchingWord::new(left.to_string(), 0, false);
|
// Require that both left and right words have a corresponding MatchingWord
|
||||||
let right = MatchingWord::new(right.to_string(), 0, false);
|
// before adding them to the matching_words result
|
||||||
|
if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) {
|
||||||
|
if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false)
|
||||||
|
{
|
||||||
matching_words.push((vec![left, right], vec![id]));
|
matching_words.push((vec![left, right], vec![id]));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||||
let exact_words = ctx.exact_words();
|
let exact_words = ctx.exact_words();
|
||||||
@ -580,18 +617,29 @@ fn create_matching_words(
|
|||||||
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
||||||
|
|
||||||
let matching_word = match typos(word, authorize_typos, config) {
|
let matching_word = match typos(word, authorize_typos, config) {
|
||||||
QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
|
QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix),
|
||||||
QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
|
QueryKind::Tolerant { typo, word } => {
|
||||||
|
matching_word_cache.insert(word, typo, prefix)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
if let Some(matching_word) = matching_word {
|
||||||
matching_words.push((vec![matching_word], vec![id]));
|
matching_words.push((vec![matching_word], vec![id]));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
// create CONSECUTIVE matching words wrapping all the words in the phrase
|
// create CONSECUTIVE matching words wrapping all the words in the phrase
|
||||||
PrimitiveQueryPart::Phrase(words) => {
|
PrimitiveQueryPart::Phrase(words) => {
|
||||||
let ids: Vec<_> =
|
let ids: Vec<_> =
|
||||||
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
|
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
|
||||||
let words =
|
// Require that all words of the phrase have a corresponding MatchingWord
|
||||||
words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect();
|
// before adding any of them to the matching_words result
|
||||||
matching_words.push((words, ids));
|
if let Some(phrase_matching_words) = words
|
||||||
|
.into_iter()
|
||||||
|
.flatten()
|
||||||
|
.map(|w| matching_word_cache.insert(w, 0, false))
|
||||||
|
.collect()
|
||||||
|
{
|
||||||
|
matching_words.push((phrase_matching_words, ids));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -603,7 +651,8 @@ fn create_matching_words(
|
|||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
query: &[PrimitiveQueryPart],
|
query: &[PrimitiveQueryPart],
|
||||||
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||||
|
matching_word_cache: &mut MatchingWordCache,
|
||||||
mut id: PrimitiveWordId,
|
mut id: PrimitiveWordId,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
const MAX_NGRAM: usize = 3;
|
const MAX_NGRAM: usize = 3;
|
||||||
@ -621,6 +670,7 @@ fn create_matching_words(
|
|||||||
authorize_typos,
|
authorize_typos,
|
||||||
part.clone(),
|
part.clone(),
|
||||||
matching_words,
|
matching_words,
|
||||||
|
matching_word_cache,
|
||||||
id,
|
id,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -643,13 +693,15 @@ fn create_matching_words(
|
|||||||
|
|
||||||
if let Some(synonyms) = ctx.synonyms(&words)? {
|
if let Some(synonyms) = ctx.synonyms(&words)? {
|
||||||
for synonym in synonyms {
|
for synonym in synonyms {
|
||||||
let synonym = synonym
|
if let Some(synonym) = synonym
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|syn| MatchingWord::new(syn, 0, false))
|
.map(|syn| matching_word_cache.insert(syn, 0, false))
|
||||||
.collect();
|
.collect()
|
||||||
|
{
|
||||||
matching_words.push((synonym, ids.clone()));
|
matching_words.push((synonym, ids.clone()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
let word = words.concat();
|
let word = words.concat();
|
||||||
let (word_len_one_typo, word_len_two_typo) =
|
let (word_len_one_typo, word_len_two_typo) =
|
||||||
ctx.min_word_len_for_typo()?;
|
ctx.min_word_len_for_typo()?;
|
||||||
@ -662,18 +714,27 @@ fn create_matching_words(
|
|||||||
};
|
};
|
||||||
let matching_word = match typos(word, authorize_typos, config) {
|
let matching_word = match typos(word, authorize_typos, config) {
|
||||||
QueryKind::Exact { word, .. } => {
|
QueryKind::Exact { word, .. } => {
|
||||||
MatchingWord::new(word, 0, is_prefix)
|
matching_word_cache.insert(word, 0, is_prefix)
|
||||||
}
|
}
|
||||||
QueryKind::Tolerant { typo, word } => {
|
QueryKind::Tolerant { typo, word } => {
|
||||||
MatchingWord::new(word, typo, is_prefix)
|
matching_word_cache.insert(word, typo, is_prefix)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
if let Some(matching_word) = matching_word {
|
||||||
matching_words.push((vec![matching_word], ids));
|
matching_words.push((vec![matching_word], ids));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if !is_last {
|
if !is_last {
|
||||||
ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
|
ngrams(
|
||||||
|
ctx,
|
||||||
|
authorize_typos,
|
||||||
|
tail,
|
||||||
|
matching_words,
|
||||||
|
matching_word_cache,
|
||||||
|
id + 1,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -683,8 +744,9 @@ fn create_matching_words(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut matching_word_cache = MatchingWordCache::default();
|
||||||
let mut matching_words = Vec::new();
|
let mut matching_words = Vec::new();
|
||||||
ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
|
ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?;
|
||||||
Ok(MatchingWords::new(matching_words))
|
Ok(MatchingWords::new(matching_words))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -814,6 +876,7 @@ mod test {
|
|||||||
use rand::{Rng, SeedableRng};
|
use rand::{Rng, SeedableRng};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use crate::index::tests::TempIndex;
|
||||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@ -1294,6 +1357,27 @@ mod test {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_dont_create_matching_word_for_long_words() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house";
|
||||||
|
let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
|
||||||
|
builder.words_limit(10);
|
||||||
|
let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{matching_words:?}"), @r###"
|
||||||
|
[
|
||||||
|
([MatchingWord { word: "house", typo: 1, prefix: true }], [3])
|
||||||
|
([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
|
||||||
|
([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1])
|
||||||
|
([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
|
||||||
|
([MatchingWord { word: "house", typo: 1, prefix: true }], [1])
|
||||||
|
([MatchingWord { word: "what", typo: 0, prefix: false }], [0])
|
||||||
|
([MatchingWord { word: "a", typo: 0, prefix: false }], [1])
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn disable_typo_on_word() {
|
fn disable_typo_on_word() {
|
||||||
let query = "goodbye";
|
let query = "goodbye";
|
||||||
@ -1310,4 +1394,67 @@ mod test {
|
|||||||
Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
|
Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The memory usage test below is disabled because `cargo test` runs multiple tests in parallel,
|
||||||
|
// which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run
|
||||||
|
// manually from time to time, so I kept it here, commented-out.
|
||||||
|
|
||||||
|
// use std::alloc::{GlobalAlloc, System};
|
||||||
|
// use std::sync::atomic::{self, AtomicI64};
|
||||||
|
//
|
||||||
|
// #[global_allocator]
|
||||||
|
// static ALLOC: CountingAlloc =
|
||||||
|
// CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) };
|
||||||
|
//
|
||||||
|
// pub struct CountingAlloc {
|
||||||
|
// pub resident: AtomicI64,
|
||||||
|
// pub allocated: AtomicI64,
|
||||||
|
// }
|
||||||
|
// unsafe impl GlobalAlloc for CountingAlloc {
|
||||||
|
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
|
||||||
|
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||||
|
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||||
|
//
|
||||||
|
// System.alloc(layout)
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
|
||||||
|
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||||
|
// System.dealloc(ptr, layout)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// #[test]
|
||||||
|
// fn memory_usage_of_ten_word_query() {
|
||||||
|
// let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst);
|
||||||
|
// let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst);
|
||||||
|
//
|
||||||
|
// let index = TempIndex::new();
|
||||||
|
// let rtxn = index.read_txn().unwrap();
|
||||||
|
// let query = "a beautiful summer house by the beach overlooking what seems";
|
||||||
|
// let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
|
||||||
|
// builder.words_limit(10);
|
||||||
|
// let x = builder.build(query.tokenize()).unwrap().unwrap();
|
||||||
|
// let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst);
|
||||||
|
// let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst);
|
||||||
|
//
|
||||||
|
// // Weak check on the memory usage
|
||||||
|
// // Don't keep more than 5MB. (Arguably 5MB is already too high)
|
||||||
|
// assert!(resident_after - resident_before < 5_000_000);
|
||||||
|
// // Don't allocate more than 10MB.
|
||||||
|
// assert!(allocated_after - allocated_before < 10_000_000);
|
||||||
|
//
|
||||||
|
// // Use these snapshots to measure the exact memory usage.
|
||||||
|
// // The values below were correct at the time I wrote them.
|
||||||
|
// // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950");
|
||||||
|
// // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502");
|
||||||
|
//
|
||||||
|
// // Note, with the matching word cache deactivated, the memory usage was:
|
||||||
|
// // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697");
|
||||||
|
// // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588");
|
||||||
|
// // or about 20x more resident memory (90MB vs 4.5MB)
|
||||||
|
//
|
||||||
|
// // Use x
|
||||||
|
// let _x = x;
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||||
concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH,
|
|
||||||
};
|
|
||||||
use crate::error::{InternalError, SerializationError};
|
use crate::error::{InternalError, SerializationError};
|
||||||
use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
|
use crate::{
|
||||||
|
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
||||||
|
};
|
||||||
|
|
||||||
/// Extracts the word and positions where this word appears and
|
/// Extracts the word and positions where this word appears and
|
||||||
/// prefixes it by the document id.
|
/// prefixes it by the document id.
|
||||||
|
@ -6,9 +6,8 @@ use heed::BytesEncode;
|
|||||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||||
use crate::heed_codec::StrRefCodec;
|
use crate::heed_codec::StrRefCodec;
|
||||||
use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
|
|
||||||
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
|
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
|
||||||
use crate::{FieldId, Result};
|
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||||
|
|
||||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||||
///
|
///
|
||||||
|
@ -12,9 +12,8 @@ use serde_json::Value;
|
|||||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::InternalError;
|
use crate::error::InternalError;
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
|
|
||||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
|
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
||||||
|
|
||||||
/// Extracts the facet values of each faceted field of each document.
|
/// Extracts the facet values of each faceted field of each document.
|
||||||
///
|
///
|
||||||
|
@ -18,20 +18,7 @@ pub use merge_functions::{
|
|||||||
serialize_roaring_bitmap, MergeFn,
|
serialize_roaring_bitmap, MergeFn,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The maximum length a LMDB key can be.
|
use crate::MAX_WORD_LENGTH;
|
||||||
///
|
|
||||||
/// Note that the actual allowed length is a little bit higher, but
|
|
||||||
/// we keep a margin of safety.
|
|
||||||
const MAX_LMDB_KEY_LENGTH: usize = 500;
|
|
||||||
|
|
||||||
/// The maximum length a field value can be when inserted in an LMDB key.
|
|
||||||
///
|
|
||||||
/// This number is determined by the keys of the different facet databases
|
|
||||||
/// and adding a margin of safety.
|
|
||||||
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
|
|
||||||
|
|
||||||
/// The maximum length a word can be
|
|
||||||
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
|
|
||||||
|
|
||||||
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
||||||
key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
|
key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user