538: speedup exact words r=Kerollmops a=MarinPostma

This PR make `exact_words` return an `Option` instead of an empty set, since set creation is costly, as noticed by `@kerollmops.`

I was not convinces that this was the cause for all of the performance drop we measured, and then realized that methods that initialized it were called recursively which caused initialization times to add up. While the first fix solves the issue when not using exact words, using exact word remained way more expensive that it should be. To address this issue, the exact words are cached into the `Context`, so they are only initialized once.


Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2022-05-30 08:20:34 +00:00 committed by GitHub
commit 582930dbbb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 36 additions and 22 deletions

View File

@ -1041,10 +1041,10 @@ impl Index {
}
/// List the words on which typo are not allowed
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<Option<fst::Set<Cow<'t, [u8]>>>> {
match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? {
Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
None => Ok(fst::Set::default().map_data(Cow::Owned)?),
Some(bytes) => Ok(Some(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?)),
None => Ok(None),
}
}

View File

@ -118,7 +118,7 @@ impl<'a> Search<'a> {
let before = Instant::now();
let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
Some(query) => {
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
builder.optional_words(self.optional_words);
builder.authorize_typos(self.is_typo_authorized()?);

View File

@ -152,7 +152,7 @@ trait Context {
}
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
}
/// The query tree builder is the interface to build a query tree.
@ -162,6 +162,7 @@ pub struct QueryTreeBuilder<'a> {
optional_words: bool,
authorize_typos: bool,
words_limit: Option<usize>,
exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
}
impl<'a> Context for QueryTreeBuilder<'a> {
@ -183,16 +184,23 @@ impl<'a> Context for QueryTreeBuilder<'a> {
Ok((one, two))
}
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
self.index.exact_words(self.rtxn)
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
self.exact_words.as_ref()
}
}
impl<'a> QueryTreeBuilder<'a> {
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
/// and an Index `index`.
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self {
Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None }
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result<Self> {
Ok(Self {
rtxn,
index,
optional_words: true,
authorize_typos: true,
words_limit: None,
exact_words: index.exact_words(rtxn)?,
})
}
/// if `optional_words` is set to `false` the query tree will be
@ -277,13 +285,13 @@ pub struct TypoConfig<'a> {
pub max_typos: u8,
pub word_len_one_typo: u8,
pub word_len_two_typo: u8,
pub exact_words: fst::Set<Cow<'a, [u8]>>,
pub exact_words: Option<&'a fst::Set<Cow<'a, [u8]>>>,
}
/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
if authorize_typos && !config.exact_words.contains(&word) {
if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) {
let count = word.chars().count().min(u8::MAX as usize) as u8;
if count < config.word_len_one_typo {
QueryKind::exact(word)
@ -342,7 +350,7 @@ fn create_query_tree(
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
children.push(Operation::Query(Query {
@ -396,7 +404,7 @@ fn create_query_tree(
let concat = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
@ -501,7 +509,7 @@ fn create_matching_words(
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
@ -579,7 +587,7 @@ fn create_matching_words(
let word = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
@ -742,8 +750,7 @@ mod test {
struct TestContext {
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
postings: HashMap<String, RoaringBitmap>,
// Raw bytes for the exact word fst Set
exact_words: Vec<u8>,
exact_words: Option<fst::Set<Cow<'static, [u8]>>>,
}
impl TestContext {
@ -779,8 +786,8 @@ mod test {
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
self.exact_words.as_ref()
}
}
@ -799,6 +806,8 @@ mod test {
}
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
let exact_words =
Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
TestContext {
synonyms: hashmap! {
@ -1406,8 +1415,12 @@ mod test {
#[test]
fn test_min_word_len_typo() {
let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
let config =
TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
let config = TypoConfig {
max_typos: 2,
word_len_one_typo: 5,
word_len_two_typo: 7,
exact_words: Some(&exact_words),
};
assert_eq!(
typos("hello".to_string(), true, config.clone()),
@ -1433,6 +1446,7 @@ mod test {
let tokens = result.tokens();
let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
let context = TestContext { exact_words, ..Default::default() };
let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();

View File

@ -1495,7 +1495,7 @@ mod tests {
let words = btreeset! { S("Ab"), S("ac") };
builder.set_exact_words(words);
assert!(builder.execute(|_| ()).is_ok());
let exact_words = index.exact_words(&txn).unwrap();
let exact_words = index.exact_words(&txn).unwrap().unwrap();
for word in exact_words.into_fst().stream().into_str_vec().unwrap() {
assert!(word.0 == "ac" || word.0 == "ab");
}