Faster intersection group by

This commit is contained in:
Clément Renault 2020-01-09 16:16:42 +01:00
parent d6c9ba8f08
commit 4f7a7ea0bb
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@@ -15,7 +15,7 @@ use levenshtein_automata::DFA;
use log::debug; use log::debug;
use meilisearch_tokenizer::{is_cjk, split_query_string}; use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::DocIndex; use meilisearch_types::DocIndex;
use sdset::{Set, SetBuf, SetOperation}; use sdset::{Set, SetBuf};
use slice_group_by::{GroupBy, GroupByMut}; use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::NGRAMS; use crate::automaton::NGRAMS;
@@ -64,18 +64,15 @@ where
let operation = create_query_tree(reader, &context, query).unwrap(); let operation = create_query_tree(reader, &context, query).unwrap();
println!("{:?}", operation); println!("{:?}", operation);
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap();
println!("found {} documents", docids.len()); println!("found {} documents", docids.len());
println!("number of postings {:?}", queries.len()); println!("number of postings {:?}", queries.len());
let before = Instant::now(); let before = Instant::now();
for ((query, input), matches) in queries { for ((query, input), matches) in queries {
let op = sdset::duo::IntersectionByKey::new(&matches, &docids, |d| d.document_id, Clone::clone); // TODO optimize the filter by skipping docids that have already been seen
let buf: SetBuf<DocIndex> = op.into_set_buf(); for matches in matches.linear_group_by_key(|m| m.document_id).filter(|ms| docids.contains(&ms[0].document_id)) {
if !buf.is_empty() { // ...
let input = std::str::from_utf8(&input);
println!("({:?}, {:?}) gives {} matches", query, input, buf.len());
} }
} }