From 70d4f47f3708814b3ecd6053acb3c0facbc56fb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 Jan 2020 12:01:51 +0100
Subject: [PATCH] Differentiate short words as prefix or exact matches

---
 meilisearch-core/src/query_tree.rs           | 29 ++++++++++++--
 .../src/update/documents_addition.rs         | 40 +++++++++----------
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index 597df6f79..079c2c0eb 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -380,7 +380,7 @@ pub fn traverse_query_tree<'o, 'txn>(
         let before = Instant::now();
 
         let Query { id, prefix, kind } = query;
-        let docids = match kind {
+        let docids: Cow<Set<DocumentId>> = match kind {
             QueryKind::Tolerant(word) => {
                 if *prefix && word.len() <= 2 {
                     let prefix = {
@@ -390,10 +390,29 @@ pub fn traverse_query_tree<'o, 'txn>(
                         array
                     };
 
+                    let mut docids = Vec::new();
+
+                    // We retrieve the cached postings list for all
+                    // the words that starts with this short prefix.
                     let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
                     let distance = 0;
                     postings.insert((query, word.clone().into_bytes(), distance), result.matches);
-                    result.docids
+                    docids.extend_from_slice(&result.docids);
+
+                    // We retrieve the exact postings list for the prefix,
+                    // because we must consider these matches as exact.
+                    if let Some(result) = ctx.postings_lists.postings_list(reader, word.as_bytes())? {
+                        let distance = 0;
+                        postings.insert((query, word.clone().into_bytes(), distance), result.matches);
+                        docids.extend_from_slice(&result.docids);
+                    }
+
+                    let before = Instant::now();
+                    let docids = SetBuf::from_dirty(docids);
+                    println!("{:2$}prefix docids construction took {:.02?}", "", before.elapsed(), depth * 2);
+
+                    Cow::Owned(docids)
+
                 } else {
                     let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
 
@@ -442,7 +461,11 @@ pub fn traverse_query_tree<'o, 'txn>(
                     }
                 }
 
-                Cow::Owned(SetBuf::from_dirty(docids))
+                let before = Instant::now();
+                let docids = SetBuf::from_dirty(docids);
+                println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
+
+                Cow::Owned(docids)
             },
             QueryKind::Phrase(words) => {
                 // TODO support prefix and non-prefix exact DFA
diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs
index c09f3114d..1a27ce33f 100644
--- a/meilisearch-core/src/update/documents_addition.rs
+++ b/meilisearch-core/src/update/documents_addition.rs
@@ -195,11 +195,16 @@ pub fn apply_documents_addition<'a, 'b>(
     pplc_store.clear(writer)?;
 
     for prefix_len in 1..=2 {
-        // compute prefixes and store those in the PrefixPostingsListsCache.
+        // compute prefixes and store those in the PrefixPostingsListsCache store.
         let mut previous_prefix: Option<([u8; 4], Vec<_>)> = None;
         let mut stream = words_fst.into_stream();
         while let Some(input) = stream.next() {
-            if input.len() < prefix_len { continue }
+
+            // We skip the prefixes that are shorter than the current length
+            // we want to cache (<). We must ignore the input when it is exactly the
+            // same word as the prefix because if we match exactly on it we need
+            // to consider it as an exact match and not as a prefix (=).
+            if input.len() <= prefix_len { continue }
 
             if let Some(postings_list) = postings_lists_store.postings_list(writer, input)?.map(|p| p.matches.into_owned()) {
                 let prefix = &input[..prefix_len];
@@ -208,38 +213,33 @@ pub fn apply_documents_addition<'a, 'b>(
                 arr_prefix[..prefix_len].copy_from_slice(prefix);
 
                 match previous_prefix {
-                    Some((ref mut prev_prefix, ref mut prev_postings_list)) if *prev_prefix != arr_prefix => {
-                        prev_postings_list.sort_unstable();
-                        prev_postings_list.dedup();
+                    Some((ref mut prev_prefix, ref mut prev_pl)) if *prev_prefix != arr_prefix => {
+                        prev_pl.sort_unstable();
+                        prev_pl.dedup();
 
                         if let Ok(prefix) = std::str::from_utf8(&prev_prefix[..prefix_len]) {
-                            debug!("writing the prefix of {:?} of length {}",
-                                prefix, prev_postings_list.len());
+                            debug!("writing the prefix of {:?} of length {}", prefix, prev_pl.len());
                         }
 
-                        let pls = Set::new_unchecked(&prev_postings_list);
+                        let pls = Set::new_unchecked(&prev_pl);
                         pplc_store.put_prefix_postings_list(writer, *prev_prefix, &pls)?;
 
                         *prev_prefix = arr_prefix;
-                        prev_postings_list.clear();
-                        prev_postings_list.extend_from_slice(&postings_list);
-                    },
-                    Some((_, ref mut prev_postings_list)) => {
-                        prev_postings_list.extend_from_slice(&postings_list);
-                    },
-                    None => {
-                        previous_prefix = Some((arr_prefix, postings_list.to_vec()));
+                        prev_pl.clear();
+                        prev_pl.extend_from_slice(&postings_list);
                     },
+                    Some((_, ref mut prev_pl)) => prev_pl.extend_from_slice(&postings_list),
+                    None => previous_prefix = Some((arr_prefix, postings_list.to_vec())),
                 }
             }
         }
 
         // write the last prefix postings lists
-        if let Some((prev_prefix, mut prev_postings_list)) = previous_prefix.take() {
-            prev_postings_list.sort_unstable();
-            prev_postings_list.dedup();
+        if let Some((prev_prefix, mut prev_pl)) = previous_prefix.take() {
+            prev_pl.sort_unstable();
+            prev_pl.dedup();
 
-            let pls = Set::new_unchecked(&prev_postings_list);
+            let pls = Set::new_unchecked(&prev_pl);
             pplc_store.put_prefix_postings_list(writer, prev_prefix, &pls)?;
         }
     }
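
The change above rests on one idea: for a one or two byte query word used as a prefix, the docids are built from two sources, the cached prefix postings list (every word starting with the prefix) and the exact postings list of the word itself, and the two are merged into a single deduplicated set while the exact lookup keeps its matches recorded as exact. The sketch below is a standalone illustration of that merge, not code from this patch; DocumentId, the two BTreeMaps and short_prefix_docids are stand-ins for the stores and types used above.

    use std::collections::BTreeMap;

    type DocumentId = u64;

    /// Build docids for a short (<= 2 byte) prefix query the way the patch does:
    /// cached prefix postings list + exact postings list of the word itself,
    /// then sort and deduplicate (the role played by SetBuf::from_dirty above).
    fn short_prefix_docids(
        word: &str,                                           // assumed: word.len() <= 2
        prefix_cache: &BTreeMap<[u8; 4], Vec<DocumentId>>,    // stand-in for the prefix postings cache
        postings_lists: &BTreeMap<Vec<u8>, Vec<DocumentId>>,  // stand-in for the exact postings lists
    ) -> Vec<DocumentId> {
        let mut prefix = [0u8; 4];
        prefix[..word.len()].copy_from_slice(word.as_bytes());

        // Docids of every word starting with this prefix, cached at indexing time.
        let mut docids = prefix_cache.get(&prefix).cloned().unwrap_or_default();

        // Docids of the word itself: looked up separately so they can be
        // treated as exact matches rather than mere prefix matches.
        if let Some(exact) = postings_lists.get(word.as_bytes()) {
            docids.extend_from_slice(exact);
        }

        docids.sort_unstable();
        docids.dedup();
        docids
    }

    fn main() {
        let mut prefix_cache = BTreeMap::new();
        let mut postings_lists = BTreeMap::new();

        // "he" is both a standalone word (exact match) and a prefix of "hello".
        postings_lists.insert(b"he".to_vec(), vec![1, 4]);
        postings_lists.insert(b"hello".to_vec(), vec![2, 4]);
        // The cache only holds words strictly longer than the prefix, which is
        // what the `<` to `<=` skip in documents_addition.rs guarantees.
        prefix_cache.insert(*b"he\0\0", vec![2, 4]);

        assert_eq!(short_prefix_docids("he", &prefix_cache, &postings_lists), vec![1, 2, 4]);
    }

Seen from that angle, the `<` to `<=` change in documents_addition.rs keeps the two sources complementary: the cache stops storing the word that is equal to the prefix, and the query side re-adds it through the exact postings list so it can be ranked as an exact match.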