849: Update nbHits count with filtered documents r=MarinPostma a=balajisivaraman

Closes #764 
close #1039

After discussing with @MarinPostma on Slack, this is my first attempt at implementing this for the basic flow that will go through `bucket_sort_with_distinct`.

A few thoughts here: 

- For getting the count of filtered documents alone, I originally thought of using `filter_map.values().filter(|&&v| !v).count()`. In a few cases, this was the same as what I have now implemented. But I realised I couldn't do something similar for `distinct`. So for being consistent, I have implemented both in a similar fashion.
- I also needed the `contains_key` check to ensure we're not counting the same document ID twice.

@MarinPostma also mentioned that this will be an approximation since the sort is lazy. In the test example that I've updated, the actual filtered count will be just 19 (for `male` records), but due to the `limit` in play, it returns 32 (filtering out 11 records overall).

Please let me know if this is the kind of fix we are looking for, and I can implement it in the placeholder search also.

Co-authored-by: Balaji Sivaraman <balaji@balajisivaraman.com>
This commit is contained in:
bors[bot] 2020-11-26 09:53:13 +00:00 committed by GitHub
commit f564a9ce51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 129 additions and 7 deletions

View File

@ -212,6 +212,7 @@ where
FD: Fn(DocumentId) -> Option<u64>,
{
let mut result = SortResult::default();
let mut filtered_count = 0;
let words_set = index.main.words_fst(reader)?;
let stop_words = index.main.stop_words_fst(reader)?;
@ -322,19 +323,36 @@ where
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
*entry.or_insert_with(|| {
let accepted = (filter)(document.id);
// we only want to count it out the first time we see it
if !accepted {
filtered_count += 1;
}
accepted
})
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
let mut seen = true;
let key = entry.or_insert_with(|| {
seen = false;
(distinct)(document.id).map(Rc::new)
});
match key.clone() {
let distinct = match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
// we only want to count the document if it is the first time we see it and
// if it wasn't accepted by distinct
if !seen && !distinct {
filtered_count += 1;
}
}
// the requested range end is reached: stop computing distinct
@ -396,7 +414,7 @@ where
}
}
result.documents = documents;
result.nb_hits = docids.len();
result.nb_hits = docids.len() - filtered_count;
Ok(result)
}

View File

@ -225,10 +225,17 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> {
fn sort_result_from_docids(&self, docids: &[DocumentId], range: Range<usize>) -> SortResult {
let mut sort_result = SortResult::default();
let mut filtered_count = 0;
let mut result = match self.filter {
Some(ref filter) => docids
.iter()
.filter(|item| (filter)(**item))
.filter(|item| {
let accepted = (filter)(**item);
if !accepted {
filtered_count += 1;
}
accepted
})
.skip(range.start)
.take(range.end - range.start)
.map(|&id| Document::from_highlights(id, &[]))
@ -248,15 +255,19 @@ impl<'c, 'f, 'd, 'i> QueryBuilder<'c, 'f, 'd, 'i> {
result.retain(|doc| {
let id = doc.id;
let key = (distinct)(id);
match key {
let distinct_accepted = match key {
Some(key) => distinct_map.register(key),
None => distinct_map.register_without_key(),
};
if !distinct_accepted {
filtered_count += 1;
}
distinct_accepted
});
}
sort_result.documents = result;
sort_result.nb_hits = docids.len();
sort_result.nb_hits = docids.len() - filtered_count;
sort_result
}

View File

@ -588,3 +588,48 @@ async fn placeholder_search_with_empty_query() {
assert_eq!(response["hits"].as_array().unwrap().len(), 3);
});
}
#[actix_rt::test]
async fn test_filter_nb_hits_search_placeholder() {
let mut server = common::Server::with_uid("test");
let body = json!({
"uid": "test",
"primaryKey": "id",
});
server.create_index(body).await;
let documents = json!([
{
"id": 1,
"content": "a",
"color": "green",
"size": 1,
},
{
"id": 2,
"content": "a",
"color": "green",
"size": 2,
},
{
"id": 3,
"content": "a",
"color": "blue",
"size": 3,
},
]);
server.add_or_update_multiple_documents(documents).await;
let (response, _) = server.search_post(json!({})).await;
assert_eq!(response["nbHits"], 3);
server.update_distinct_attribute(json!("color")).await;
let (response, _) = server.search_post(json!({})).await;
assert_eq!(response["nbHits"], 2);
let (response, _) = server.search_post(json!({"filters": "size < 3"})).await;
println!("result: {}", response);
assert_eq!(response["nbHits"], 1);
}

View File

@ -1829,3 +1829,51 @@ async fn update_documents_with_facet_distribution() {
let (response2, _) = server.search_post(search).await;
assert_json_eq!(expected_facet_distribution, response2["facetsDistribution"].clone());
}
#[actix_rt::test]
async fn test_filter_nb_hits_search_normal() {
let mut server = common::Server::with_uid("test");
let body = json!({
"uid": "test",
"primaryKey": "id",
});
server.create_index(body).await;
let documents = json!([
{
"id": 1,
"content": "a",
"color": "green",
"size": 1,
},
{
"id": 2,
"content": "a",
"color": "green",
"size": 2,
},
{
"id": 3,
"content": "a",
"color": "blue",
"size": 3,
},
]);
server.add_or_update_multiple_documents(documents).await;
let (response, _) = server.search_post(json!({"q": "a"})).await;
assert_eq!(response["nbHits"], 3);
let (response, _) = server.search_post(json!({"q": "a", "filters": "size = 1"})).await;
assert_eq!(response["nbHits"], 1);
server.update_distinct_attribute(json!("color")).await;
let (response, _) = server.search_post(json!({"q": "a"})).await;
assert_eq!(response["nbHits"], 2);
let (response, _) = server.search_post(json!({"q": "a", "filters": "size < 3"})).await;
println!("result: {}", response);
assert_eq!(response["nbHits"], 1);
}