From fbcec2975d822d1105d5230f0191cb6ba79ad749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 Jan 2020 18:23:55 +0100 Subject: [PATCH] wip: Impl a basic tree traversing --- Cargo.lock | 6 +- meilisearch-core/Cargo.toml | 6 +- meilisearch-core/src/bucket_sort.rs | 19 +++++- meilisearch-core/src/query_tree.rs | 95 ++++++++++++++++------------- 4 files changed, 76 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 750cdc30c..6cdab9a30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -962,7 +962,7 @@ dependencies = [ "once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "sdset 0.3.6 (git+https://github.com/Kerollmops/sdset?branch=intersection-by-key)", "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)", "siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1693,7 +1693,7 @@ dependencies = [ [[package]] name = "sdset" version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" +source = "git+https://github.com/Kerollmops/sdset?branch=intersection-by-key#03c5008a4b23e11ba89c5579b023473b555d3864" [[package]] name = "semver" @@ -2807,7 +2807,7 @@ dependencies = [ "checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421" "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" "checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" -"checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9" +"checksum sdset 0.3.6 (git+https://github.com/Kerollmops/sdset?branch=intersection-by-key)" = "" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0" diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 3b19369f8..a0d50ed01 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -25,13 +25,17 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.4" } meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" } once_cell = "1.2.0" ordered-float = { version = "1.0.2", features = ["serde"] } -sdset = "0.3.6" serde = { version = "1.0.101", features = ["derive"] } serde_json = "1.0.41" siphasher = "0.3.1" slice-group-by = "0.2.6" zerocopy = "0.2.8" +[dependencies.sdset] +# version = "0.3.6" +git = "https://github.com/Kerollmops/sdset" +branch = "intersection-by-key" + [dev-dependencies] assert_matches = "1.3" criterion = "0.3" diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 17cb8c47c..5129f1b55 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -15,7 +15,7 @@ use levenshtein_automata::DFA; use log::debug; use meilisearch_tokenizer::{is_cjk, split_query_string}; use meilisearch_types::DocIndex; -use sdset::{Set, SetBuf}; +use sdset::{Set, SetBuf, SetOperation}; use slice_group_by::{GroupBy, GroupByMut}; use crate::automaton::NGRAMS; @@ -28,7 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::raw_document::RawDocument; use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; use crate::{store, Document, DocumentId, MResult}; -use crate::query_tree::create_query_tree; +use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult}; pub fn bucket_sort<'c, FI>( reader: &heed::RoTxn, @@ -50,6 +50,21 @@ where let operation = create_query_tree(reader, postings_lists_store, synonyms_store, query).unwrap(); println!("{:?}", operation); + let QueryResult { docids, queries } = traverse_query_tree(reader, postings_lists_store, &operation).unwrap(); + println!("found {} documents", docids.len()); + println!("number of postings {:?}", queries.len()); + + let before = Instant::now(); + for (query, matches) in queries { + let op = sdset::duo::IntersectionByKey::new(&matches, &docids, |d| d.document_id, Clone::clone); + let buf: SetBuf = op.into_set_buf(); + if !buf.is_empty() { + println!("{:?} gives {} matches", query, buf.len()); + } + } + + println!("matches cleaned in {:.02?}", before.elapsed()); + // We delegate the filter work to the distinct query builder, // specifying a distinct rule that has no effect. if filter.is_some() { diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index 17bf5f483..148e66da5 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -204,22 +204,28 @@ pub fn create_query_tree( Ok(create_operation(ngrams, Operation::Or)) } -pub struct QueryResult<'q, 'c> { - pub docids: Cow<'c, Set>, - pub queries: HashMap<&'q Query, Cow<'c, Set>>, +pub struct QueryResult<'o, 'txn> { + pub docids: SetBuf, + pub queries: HashMap<&'o Query, Cow<'txn, Set>>, } -pub type Postings<'q, 'c> = HashMap<&'q Query, Cow<'c, Set>>; -pub type Cache<'o, 'c> = HashMap<&'o Operation, Cow<'c, Set>>; +pub type Postings<'o, 'txn> = HashMap<&'o Query, Cow<'txn, Set>>; +pub type Cache<'o, 'c> = HashMap<&'o Operation, SetBuf>; -pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> QueryResult<'a, 'c> { - fn execute_and<'o, 'c>( - ctx: &'c Context, - cache: &mut Cache<'o, 'c>, - postings: &mut Postings<'o, 'c>, +pub fn traverse_query_tree<'o, 'txn>( + reader: &'txn heed::RoTxn, + postings_lists: store::PostingsLists, + tree: &'o Operation, +) -> MResult> +{ + fn execute_and<'o, 'txn>( + reader: &'txn heed::RoTxn, + pls: store::PostingsLists, + cache: &mut Cache<'o, 'txn>, + postings: &mut Postings<'o, 'txn>, depth: usize, operations: &'o [Operation], - ) -> Cow<'c, Set> + ) -> MResult> { println!("{:1$}AND", "", depth * 2); @@ -229,9 +235,9 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que for op in operations { if cache.get(op).is_none() { let docids = match op { - Operation::And(ops) => execute_and(ctx, cache, postings, depth + 1, &ops), - Operation::Or(ops) => execute_or(ctx, cache, postings, depth + 1, &ops), - Operation::Query(query) => execute_query(ctx, postings, depth + 1, &query), + Operation::And(ops) => execute_and(reader, pls, cache, postings, depth + 1, &ops)?, + Operation::Or(ops) => execute_or(reader, pls, cache, postings, depth + 1, &ops)?, + Operation::Query(query) => execute_query(reader, pls, postings, depth + 1, &query)?, }; cache.insert(op, docids); } @@ -245,20 +251,20 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que let op = sdset::multi::Intersection::new(results); let docids = op.into_set_buf(); - let docids: Cow> = Cow::Owned(docids); println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2); - docids + Ok(docids) } - fn execute_or<'o, 'c>( - ctx: &'c Context, - cache: &mut Cache<'o, 'c>, - postings: &mut Postings<'o, 'c>, + fn execute_or<'o, 'txn>( + reader: &'txn heed::RoTxn, + pls: store::PostingsLists, + cache: &mut Cache<'o, 'txn>, + postings: &mut Postings<'o, 'txn>, depth: usize, operations: &'o [Operation], - ) -> Cow<'c, Set> + ) -> MResult> { println!("{:1$}OR", "", depth * 2); @@ -270,46 +276,47 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que Some(docids) => docids, None => { let docids = match op { - Operation::And(ops) => execute_and(ctx, cache, postings, depth + 1, &ops), - Operation::Or(ops) => execute_or(ctx, cache, postings, depth + 1, &ops), - Operation::Query(query) => execute_query(ctx, postings, depth + 1, &query), + Operation::And(ops) => execute_and(reader, pls, cache, postings, depth + 1, &ops)?, + Operation::Or(ops) => execute_or(reader, pls, cache, postings, depth + 1, &ops)?, + Operation::Query(query) => execute_query(reader, pls, postings, depth + 1, &query)?, }; cache.entry(op).or_insert(docids) } }; - ids.extend(docids.as_ref()); + ids.extend_from_slice(docids.as_ref()); } let docids = SetBuf::from_dirty(ids); - let docids: Cow> = Cow::Owned(docids); println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2); - docids + Ok(docids) } - fn execute_query<'o, 'c>( - ctx: &'c Context, - postings: &mut Postings<'o, 'c>, + fn execute_query<'o, 'txn>( + reader: &'txn heed::RoTxn, + pls: store::PostingsLists, + postings: &mut Postings<'o, 'txn>, depth: usize, query: &'o Query, - ) -> Cow<'c, Set> + ) -> MResult> { let before = Instant::now(); let (docids, matches) = match query { Query::Tolerant(_, word) | Query::Exact(_, word) | Query::Prefix(_, word) => { - if let Some(PostingsList { docids, matches }) = ctx.postings.get(word) { - (Cow::Borrowed(docids.as_set()), Cow::Borrowed(matches.as_set())) + if let Some(docindexes) = pls.postings_list(reader, word.as_bytes())? { + let mut docids: Vec<_> = docindexes.iter().map(|d| d.document_id).collect(); + docids.dedup(); + (SetBuf::new(docids).unwrap(), docindexes) } else { - (Cow::default(), Cow::default()) + (SetBuf::default(), Cow::default()) } }, Query::Phrase(_, words) => { if let [first, second] = words.as_slice() { - let default = SetBuf::default(); - let first = ctx.postings.get(first).map(|pl| &pl.matches).unwrap_or(&default); - let second = ctx.postings.get(second).map(|pl| &pl.matches).unwrap_or(&default); + let first = pls.postings_list(reader, first.as_bytes())?.unwrap_or_default(); + let second = pls.postings_list(reader, second.as_bytes())?.unwrap_or_default(); let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| { let x = (a.document_id, a.attribute, (a.word_index as u32) + 1); @@ -327,10 +334,10 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que println!("{:2$}matches {:?}", "", matches, depth * 2); - (Cow::Owned(SetBuf::new(docids).unwrap()), Cow::Owned(SetBuf::new(matches).unwrap())) + (SetBuf::new(docids).unwrap(), Cow::Owned(SetBuf::new(matches).unwrap())) } else { println!("{:2$}{:?} skipped", "", words, depth * 2); - (Cow::default(), Cow::default()) + (SetBuf::default(), Cow::default()) } }, }; @@ -338,17 +345,17 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que println!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2); postings.insert(query, matches); - docids + Ok(docids) } let mut cache = Cache::new(); let mut postings = Postings::new(); let docids = match tree { - Operation::And(operations) => execute_and(ctx, &mut cache, &mut postings, 0, &operations), - Operation::Or(operations) => execute_or(ctx, &mut cache, &mut postings, 0, &operations), - Operation::Query(query) => execute_query(ctx, &mut postings, 0, &query), + Operation::And(ops) => execute_and(reader, postings_lists, &mut cache, &mut postings, 0, &ops)?, + Operation::Or(ops) => execute_or(reader, postings_lists, &mut cache, &mut postings, 0, &ops)?, + Operation::Query(query) => execute_query(reader, postings_lists, &mut postings, 0, &query)?, }; - QueryResult { docids, queries: postings } + Ok(QueryResult { docids, queries: postings }) }