mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-23 13:10:06 +01:00
Merge #538
538: speedup exact words r=Kerollmops a=MarinPostma This PR make `exact_words` return an `Option` instead of an empty set, since set creation is costly, as noticed by `@kerollmops.` I was not convinces that this was the cause for all of the performance drop we measured, and then realized that methods that initialized it were called recursively which caused initialization times to add up. While the first fix solves the issue when not using exact words, using exact word remained way more expensive that it should be. To address this issue, the exact words are cached into the `Context`, so they are only initialized once. Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
commit
582930dbbb
@ -1041,10 +1041,10 @@ impl Index {
|
||||
}
|
||||
|
||||
/// List the words on which typo are not allowed
|
||||
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
|
||||
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<Option<fst::Set<Cow<'t, [u8]>>>> {
|
||||
match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? {
|
||||
Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
|
||||
None => Ok(fst::Set::default().map_data(Cow::Owned)?),
|
||||
Some(bytes) => Ok(Some(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -118,7 +118,7 @@ impl<'a> Search<'a> {
|
||||
let before = Instant::now();
|
||||
let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
|
||||
Some(query) => {
|
||||
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
|
||||
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
|
||||
builder.optional_words(self.optional_words);
|
||||
|
||||
builder.authorize_typos(self.is_typo_authorized()?);
|
||||
|
@ -152,7 +152,7 @@ trait Context {
|
||||
}
|
||||
/// Returns the minimum word len for 1 and 2 typos.
|
||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
||||
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
|
||||
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
|
||||
}
|
||||
|
||||
/// The query tree builder is the interface to build a query tree.
|
||||
@ -162,6 +162,7 @@ pub struct QueryTreeBuilder<'a> {
|
||||
optional_words: bool,
|
||||
authorize_typos: bool,
|
||||
words_limit: Option<usize>,
|
||||
exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
|
||||
}
|
||||
|
||||
impl<'a> Context for QueryTreeBuilder<'a> {
|
||||
@ -183,16 +184,23 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
||||
Ok((one, two))
|
||||
}
|
||||
|
||||
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
|
||||
self.index.exact_words(self.rtxn)
|
||||
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
|
||||
self.exact_words.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> QueryTreeBuilder<'a> {
|
||||
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
|
||||
/// and an Index `index`.
|
||||
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self {
|
||||
Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None }
|
||||
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result<Self> {
|
||||
Ok(Self {
|
||||
rtxn,
|
||||
index,
|
||||
optional_words: true,
|
||||
authorize_typos: true,
|
||||
words_limit: None,
|
||||
exact_words: index.exact_words(rtxn)?,
|
||||
})
|
||||
}
|
||||
|
||||
/// if `optional_words` is set to `false` the query tree will be
|
||||
@ -277,13 +285,13 @@ pub struct TypoConfig<'a> {
|
||||
pub max_typos: u8,
|
||||
pub word_len_one_typo: u8,
|
||||
pub word_len_two_typo: u8,
|
||||
pub exact_words: fst::Set<Cow<'a, [u8]>>,
|
||||
pub exact_words: Option<&'a fst::Set<Cow<'a, [u8]>>>,
|
||||
}
|
||||
|
||||
/// Return the `QueryKind` of a word depending on `authorize_typos`
|
||||
/// and the provided word length.
|
||||
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
|
||||
if authorize_typos && !config.exact_words.contains(&word) {
|
||||
if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) {
|
||||
let count = word.chars().count().min(u8::MAX as usize) as u8;
|
||||
if count < config.word_len_one_typo {
|
||||
QueryKind::exact(word)
|
||||
@ -342,7 +350,7 @@ fn create_query_tree(
|
||||
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
|
||||
}
|
||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||
let exact_words = ctx.exact_words()?;
|
||||
let exact_words = ctx.exact_words();
|
||||
let config =
|
||||
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
||||
children.push(Operation::Query(Query {
|
||||
@ -396,7 +404,7 @@ fn create_query_tree(
|
||||
let concat = words.concat();
|
||||
let (word_len_one_typo, word_len_two_typo) =
|
||||
ctx.min_word_len_for_typo()?;
|
||||
let exact_words = ctx.exact_words()?;
|
||||
let exact_words = ctx.exact_words();
|
||||
let config = TypoConfig {
|
||||
max_typos: 1,
|
||||
word_len_one_typo,
|
||||
@ -501,7 +509,7 @@ fn create_matching_words(
|
||||
}
|
||||
|
||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||
let exact_words = ctx.exact_words()?;
|
||||
let exact_words = ctx.exact_words();
|
||||
let config =
|
||||
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
||||
|
||||
@ -579,7 +587,7 @@ fn create_matching_words(
|
||||
let word = words.concat();
|
||||
let (word_len_one_typo, word_len_two_typo) =
|
||||
ctx.min_word_len_for_typo()?;
|
||||
let exact_words = ctx.exact_words()?;
|
||||
let exact_words = ctx.exact_words();
|
||||
let config = TypoConfig {
|
||||
max_typos: 1,
|
||||
word_len_one_typo,
|
||||
@ -742,8 +750,7 @@ mod test {
|
||||
struct TestContext {
|
||||
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
|
||||
postings: HashMap<String, RoaringBitmap>,
|
||||
// Raw bytes for the exact word fst Set
|
||||
exact_words: Vec<u8>,
|
||||
exact_words: Option<fst::Set<Cow<'static, [u8]>>>,
|
||||
}
|
||||
|
||||
impl TestContext {
|
||||
@ -779,8 +786,8 @@ mod test {
|
||||
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
||||
}
|
||||
|
||||
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
|
||||
Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
|
||||
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
|
||||
self.exact_words.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
@ -799,6 +806,8 @@ mod test {
|
||||
}
|
||||
|
||||
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
|
||||
let exact_words =
|
||||
Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
|
||||
|
||||
TestContext {
|
||||
synonyms: hashmap! {
|
||||
@ -1406,8 +1415,12 @@ mod test {
|
||||
#[test]
|
||||
fn test_min_word_len_typo() {
|
||||
let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let config =
|
||||
TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
|
||||
let config = TypoConfig {
|
||||
max_typos: 2,
|
||||
word_len_one_typo: 5,
|
||||
word_len_two_typo: 7,
|
||||
exact_words: Some(&exact_words),
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
typos("hello".to_string(), true, config.clone()),
|
||||
@ -1433,6 +1446,7 @@ mod test {
|
||||
|
||||
let tokens = result.tokens();
|
||||
let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
|
||||
let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
|
||||
let context = TestContext { exact_words, ..Default::default() };
|
||||
let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();
|
||||
|
||||
|
@ -1495,7 +1495,7 @@ mod tests {
|
||||
let words = btreeset! { S("Ab"), S("ac") };
|
||||
builder.set_exact_words(words);
|
||||
assert!(builder.execute(|_| ()).is_ok());
|
||||
let exact_words = index.exact_words(&txn).unwrap();
|
||||
let exact_words = index.exact_words(&txn).unwrap().unwrap();
|
||||
for word in exact_words.into_fst().stream().into_str_vec().unwrap() {
|
||||
assert!(word.0 == "ac" || word.0 == "ab");
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user