Store the postings lists

This commit is contained in:
Clément Renault 2020-01-09 14:53:49 +01:00
parent ec8916bf54
commit d6c9ba8f08
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 20 additions and 14 deletions

View File

@ -70,11 +70,12 @@ where
println!("number of postings {:?}", queries.len()); println!("number of postings {:?}", queries.len());
let before = Instant::now(); let before = Instant::now();
for (query, matches) in queries { for ((query, input), matches) in queries {
let op = sdset::duo::IntersectionByKey::new(&matches, &docids, |d| d.document_id, Clone::clone); let op = sdset::duo::IntersectionByKey::new(&matches, &docids, |d| d.document_id, Clone::clone);
let buf: SetBuf<DocIndex> = op.into_set_buf(); let buf: SetBuf<DocIndex> = op.into_set_buf();
if !buf.is_empty() { if !buf.is_empty() {
println!("{:?} gives {} matches", query, buf.len()); let input = std::str::from_utf8(&input);
println!("({:?}, {:?}) gives {} matches", query, input, buf.len());
} }
} }

View File

@ -213,14 +213,14 @@ pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str
Ok(create_operation(ngrams, Operation::Or)) Ok(create_operation(ngrams, Operation::Or))
} }
pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>), Cow<'txn, Set<DocIndex>>>;
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
pub struct QueryResult<'o, 'txn> { pub struct QueryResult<'o, 'txn> {
pub docids: Cow<'txn, Set<DocumentId>>, pub docids: Cow<'txn, Set<DocumentId>>,
pub queries: HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>, pub queries: Postings<'o, 'txn>,
} }
pub type Postings<'o, 'txn> = HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>;
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
pub fn traverse_query_tree<'o, 'txn>( pub fn traverse_query_tree<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>, reader: &'txn heed::RoTxn<MainT>,
ctx: &Context, ctx: &Context,
@ -318,8 +318,9 @@ pub fn traverse_query_tree<'o, 'txn>(
QueryKind::Tolerant(word) => { QueryKind::Tolerant(word) => {
if *prefix && word.len() == 1 { if *prefix && word.len() == 1 {
let prefix = [word.as_bytes()[0], 0, 0, 0]; let prefix = [word.as_bytes()[0], 0, 0, 0];
let matches = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default(); let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
matches.docids postings.insert((query, word.clone().into_bytes()), result.matches);
result.docids
} else { } else {
let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) }; let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
@ -333,8 +334,9 @@ pub fn traverse_query_tree<'o, 'txn>(
let before = Instant::now(); let before = Instant::now();
let mut docids = Vec::new(); let mut docids = Vec::new();
while let Some(input) = stream.next() { while let Some(input) = stream.next() {
if let Some(postings) = ctx.postings_lists.postings_list(reader, input)? { if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
docids.extend_from_slice(&postings.docids); docids.extend_from_slice(&result.docids);
postings.insert((query, input.to_owned()), result.matches);
} }
} }
println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2); println!("{:3$}docids extend ({:?}) took {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
@ -359,8 +361,9 @@ pub fn traverse_query_tree<'o, 'txn>(
let mut docids = Vec::new(); let mut docids = Vec::new();
while let Some(input) = stream.next() { while let Some(input) = stream.next() {
if let Some(postings) = ctx.postings_lists.postings_list(reader, input)? { if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
docids.extend_from_slice(&postings.docids); docids.extend_from_slice(&result.docids);
postings.insert((query, input.to_owned()), result.matches);
} }
} }
@ -388,6 +391,10 @@ pub fn traverse_query_tree<'o, 'txn>(
docids.dedup(); docids.dedup();
let docids = SetBuf::new(docids).unwrap(); let docids = SetBuf::new(docids).unwrap();
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
postings.insert((query, vec![]), matches);
Cow::Owned(docids) Cow::Owned(docids)
} else { } else {
println!("{:2$}{:?} skipped", "", words, depth * 2); println!("{:2$}{:?} skipped", "", words, depth * 2);
@ -397,8 +404,6 @@ pub fn traverse_query_tree<'o, 'txn>(
}; };
println!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2); println!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
// postings.insert(query, matches);
Ok(docids) Ok(docids)
} }