From 54b6ab070ae13fe2fc09b0313b0e3f7243b843d6 Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 20:06:18 +1000
Subject: [PATCH 1/7] initial attempt at pairwise diff testing

---
 .gitignore              |  3 +++
 tests/diff/comparediffs | 23 +++++++++++++++++++++++
 tests/diff/urls         |  8 ++++++++
 3 files changed, 34 insertions(+)
 create mode 100755 tests/diff/comparediffs
 create mode 100644 tests/diff/urls

diff --git a/.gitignore b/.gitignore
index 4278ff0..66b8e49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,6 @@ pdfparanoia.egg-info/
 
 # ignore pdfs in the top-level dir
 /*.pdf
+
+# temporary pdfs in tests/diff/
+tests/diff/*.pdf
diff --git a/tests/diff/comparediffs b/tests/diff/comparediffs
new file mode 100755
index 0000000..f49bd51
--- /dev/null
+++ b/tests/diff/comparediffs
@@ -0,0 +1,23 @@
+for url in `cat urls`; do
+    echo "Retrieving $url";
+    # TODO allow customizing the proxies, or more generally the retrieval mechanisms
+    # TODO allow local caching
+    curl --socks localhost:1080 $url > 1.pdf
+    curl --socks localhost:1083 $url > 2.pdf
+    # TODO verify that we actually obtained pdfs
+    if diff 1.pdf 2.pdf; then
+        echo "PDFs are identical already, no need to use pdfparanoia"
+    else
+        cat 1.pdf | pdfparanoia > 1.cleaned.pdf
+        cat 2.pdf | pdfparanoia > 2.cleaned.pdf
+        if diff 1.cleaned.pdf 2.cleaned.pdf; then
+            echo "pdfparanoia successfully scrubbed the PDFs"
+        else
+            echo "pdfparanoia failed!"
+        fi
+        rm 1.cleaned.pdf
+        rm 2.cleaned.pdf
+    fi
+    rm 1.pdf
+    rm 2.pdf
+done
diff --git a/tests/diff/urls b/tests/diff/urls
new file mode 100644
index 0000000..0900fbc
--- /dev/null
+++ b/tests/diff/urls
@@ -0,0 +1,8 @@
+http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
+http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
+http://annals.math.princeton.edu/wp-content/uploads/annals-v176-n2-p11-s.pdf
+http://www.worldscientific.com/doi/pdf/10.1142/S2010326311500018
+http://www.sciencedirect.com/science?_ob=MiamiImageURL&_cid=272585&_user=994540&_pii=S0001870812001806&_check=y&_origin=article&_zone=toolbar&_coverDate=10-Sep-2012&view=c&originContentFamily=serial&wchp=dGLzVlt-zSkWb&md5=bfeb5e0619d45362640529aff02baeda&pid=1-s2.0-S0001870812001806-main.pdf
+http://www.ams.org/journals/mcom/2012-81-278/S0025-5718-2011-02542-1/S0025-5718-2011-02542-1.pdf
+http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
+http://nyjm.albany.edu/j/2009/15-14p.pdf

From 2fb3783dea0935158b79bb3ff5887aca39c04d15 Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 22:45:34 +1000
Subject: [PATCH 2/7] comparediffs seems to be working nicely

---
 .gitignore              |  2 +-
 tests/diff/comparediffs | 87 +++++++++++++++++++++++++++++++----------
 tests/diff/urls         |  8 ++--
 tests/diff/urls.denied  |  2 +
 4 files changed, 72 insertions(+), 27 deletions(-)
 create mode 100644 tests/diff/urls.denied

diff --git a/.gitignore b/.gitignore
index 66b8e49..d14f25c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,4 @@ pdfparanoia.egg-info/
 /*.pdf
 
 # temporary pdfs in tests/diff/
-tests/diff/*.pdf
+tests/diff/pdf
diff --git a/tests/diff/comparediffs b/tests/diff/comparediffs
index f49bd51..0a5c3c4 100755
--- a/tests/diff/comparediffs
+++ b/tests/diff/comparediffs
@@ -1,23 +1,68 @@
-for url in `cat urls`; do
-    echo "Retrieving $url";
-    # TODO allow customizing the proxies, or more generally the retrieval mechanisms
-    # TODO allow local caching
-    curl --socks localhost:1080 $url > 1.pdf
-    curl --socks localhost:1083 $url > 2.pdf
-    # TODO verify that we actually obtained pdfs
-    if diff 1.pdf 2.pdf; then
- echo "PDFs are identical already, no need to use pdfparanoia" - else - cat 1.pdf | pdfparanoia > 1.cleaned.pdf - cat 2.pdf | pdfparanoia > 2.cleaned.pdf - if diff 1.cleaned.pdf 2.cleaned.pdf; then - echo "pdfparanoia successful scrubbed the PDFs" - else - echo "pdfparanoia failed!" - fi - rm 1.cleaned.pdf - rm 2.cleaned.pdf +if [ "$#" != "2" ]; then + echo "Please supply the addresses of two SOCKS proxies:" + echo "compare host1:port1 host2:port2" + exit 1 +fi + +proxy1="$1" +proxy2="$2" + +curl -s --socks $proxy1 http://google.com | grep "The document has moved" > /dev/null +if [ "$?" != "0" ]; then + echo "SOCKS proxy $proxy1 doesn't seem to be working." + exit 2 +fi +curl -s --socks $proxy2 http://google.com | grep "The document has moved" > /dev/null +if [ "$?" != "0" ]; then + echo "SOCKS proxy $proxy2 doesn't seem to be working." + exit 3 +fi + +echo "--------------------------------------------------------------------------------------------------------------------------------" +echo "Comparing PDFs downloaded via '$proxy1' and '$proxy2'" +echo "Please enter URLs, one per line." +echo +echo "PDFs which still differ after pdfparanoia will be left in the pdf/ subdirectory, with suffixes .1.cleaned.pdf and .2.cleaned.pdf" +echo "--------------------------------------------------------------------------------------------------------------------------------" +echo + +mkdir -p pdf + +while read url; do + hash=`echo $url | openssl sha1 | cut -d ' ' -f 2` + echo "Retrieving $url, with hash $hash"; + if [ ! -s pdf/$hash.1.pdf ]; then + curl --socks $proxy1 -L --cookie cookie.jar -A "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" $url > pdf/$hash.1.pdf fi - rm 1.pdf - rm 2.pdf + if [ "`head -c 4 pdf/$hash.1.pdf`" == "%PDF" ]; then + if [ ! -s pdf/$hash.2.pdf ]; then + curl --socks $proxy2 -L --cookie cookie.jar -A "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" $url > pdf/$hash.2.pdf + fi + if [ "`head -c 4 pdf/$hash.2.pdf`" == "%PDF" ]; then + if cmp -s pdf/$hash.1.pdf pdf/$hash.2.pdf; then + echo "PDFs are identical already, no need to use pdfparanoia" + else + echo "PDFs differ, running pdfparanoia" + if [ ! -s pdf/$hash.1.cleaned.pdf ]; then + cat pdf/$hash.1.pdf | pdfparanoia > pdf/$hash.1.cleaned.pdf + fi + if [ ! -s pdf/$hash.2.cleaned.pdf ]; then + cat pdf/$hash.2.pdf | pdfparanoia > pdf/$hash.2.cleaned.pdf + fi + if cmp -s pdf/$hash.1.cleaned.pdf pdf/$hash.2.cleaned.pdf; then + echo "pdfparanoia successful scrubbed the PDFs" + rm pdf/$hash.*.cleaned.pdf + else + echo "pdfparanoia failed!" 
+                fi
+            fi
+        else
+            echo "Download failed from source 2"
+            rm pdf/$hash.*.pdf
+        fi
+    else
+        echo "Download failed from source 1"
+        rm pdf/$hash.*.pdf
+    fi
+    echo
 done
diff --git a/tests/diff/urls b/tests/diff/urls
index 0900fbc..630a24e 100644
--- a/tests/diff/urls
+++ b/tests/diff/urls
@@ -1,8 +1,6 @@
-http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
-http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
 http://annals.math.princeton.edu/wp-content/uploads/annals-v176-n2-p11-s.pdf
+http://www.ams.org/journals/mcom/2012-81-278/S0025-5718-2011-02542-1/S0025-5718-2011-02542-1.pdf
+http://nyjm.albany.edu/j/2009/15-14p.pdf
+http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
 http://www.worldscientific.com/doi/pdf/10.1142/S2010326311500018
 http://www.sciencedirect.com/science?_ob=MiamiImageURL&_cid=272585&_user=994540&_pii=S0001870812001806&_check=y&_origin=article&_zone=toolbar&_coverDate=10-Sep-2012&view=c&originContentFamily=serial&wchp=dGLzVlt-zSkWb&md5=bfeb5e0619d45362640529aff02baeda&pid=1-s2.0-S0001870812001806-main.pdf
diff --git a/tests/diff/urls.denied b/tests/diff/urls.denied
new file mode 100644
index 0000000..4736b80
--- /dev/null
+++ b/tests/diff/urls.denied
@@ -0,0 +1,2 @@
+http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
+http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6

From 27ad74686152607ceec39efd3740f42d149f6705 Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 23:16:15 +1000
Subject: [PATCH 3/7] adding a few more URLs for testing

---
 tests/diff/urls        | 10 ++++++++++
 tests/diff/urls.denied |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/tests/diff/urls b/tests/diff/urls
index 630a24e..00c0daf 100644
--- a/tests/diff/urls
+++ b/tests/diff/urls
@@ -1,6 +1,16 @@
 http://annals.math.princeton.edu/wp-content/uploads/annals-v176-n2-p11-s.pdf
 http://www.ams.org/journals/mcom/2012-81-278/S0025-5718-2011-02542-1/S0025-5718-2011-02542-1.pdf
+http://www.ams.org/journals/bull/2010-47-04/S0273-0979-10-01296-6/S0273-0979-10-01296-6.pdf
 http://nyjm.albany.edu/j/2009/15-14p.pdf
 http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
 http://www.worldscientific.com/doi/pdf/10.1142/S2010326311500018
 http://www.sciencedirect.com/science?_ob=MiamiImageURL&_cid=272585&_user=994540&_pii=S0001870812001806&_check=y&_origin=article&_zone=toolbar&_coverDate=10-Sep-2012&view=c&originContentFamily=serial&wchp=dGLzVlt-zSkWb&md5=bfeb5e0619d45362640529aff02baeda&pid=1-s2.0-S0001870812001806-main.pdf
+http://imrn.oxfordjournals.org/content/2002/4/165.full.pdf
+http://www.tandfonline.com/doi/pdf/10.2991/jnmp.2006.13.3.2
+http://www.jstor.org/stable/pdfplus/121105?acceptTC=true
+http://muse.jhu.edu/journals/american_journal_of_mathematics/v118/118.5bernstein.pdf
+http://archive.numdam.org/article/CM_1980__41_2_245_0.pdf
+http://www.pnas.org/content/102/26/9099.full.pdf
+http://iopscience.iop.org/1126-6708/2004/06/043/pdf/1126-6708_2004_06_043.pdf
+http://scitation.aip.org/getpdf/servlet/GetPDFServlet?filetype=pdf&id=JMAPAQ000042000007002896000001&idtype=cvips&doi=10.1063/1.1372177&prog=normal
+http://msp.org/gt/2009/13-03/gt-2009-13-033s.pdf
diff --git a/tests/diff/urls.denied b/tests/diff/urls.denied
index 4736b80..1fd8994 100644
--- a/tests/diff/urls.denied
+++ b/tests/diff/urls.denied
@@ -1,2 +1,4 @@
 http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
 http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
+http://www.ems-ph.org/journals/show_pdf.php?issn=0010-2571&vol=88&iss=1&rank=3
+http://prd.aps.org/pdf/PRD/v13/i12/p3491_1

From 74649a1a05b4683b72feb9d0158a46ecf966a22c Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 23:18:11 +1000
Subject: [PATCH 4/7] merging urls.denied back into urls

---
 tests/diff/urls        | 4 ++++
 tests/diff/urls.denied | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)
 delete mode 100644 tests/diff/urls.denied

diff --git a/tests/diff/urls b/tests/diff/urls
index 00c0daf..cba3b6c 100644
--- a/tests/diff/urls
+++ b/tests/diff/urls
@@ -14,3 +14,7 @@ http://www.pnas.org/content/102/26/9099.full.pdf
 http://iopscience.iop.org/1126-6708/2004/06/043/pdf/1126-6708_2004_06_043.pdf
 http://scitation.aip.org/getpdf/servlet/GetPDFServlet?filetype=pdf&id=JMAPAQ000042000007002896000001&idtype=cvips&doi=10.1063/1.1372177&prog=normal
 http://msp.org/gt/2009/13-03/gt-2009-13-033s.pdf
+http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
+http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
+http://www.ems-ph.org/journals/show_pdf.php?issn=0010-2571&vol=88&iss=1&rank=3
+http://prd.aps.org/pdf/PRD/v13/i12/p3491_1
diff --git a/tests/diff/urls.denied b/tests/diff/urls.denied
deleted file mode 100644
index 1fd8994..0000000
--- a/tests/diff/urls.denied
+++ /dev/null
@@ -1,4 +0,0 @@
-http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
-http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
-http://www.ems-ph.org/journals/show_pdf.php?issn=0010-2571&vol=88&iss=1&rank=3
-http://prd.aps.org/pdf/PRD/v13/i12/p3491_1

From 702f2e2895be6bb801fdb5385e88c4b591b736d2 Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 23:25:10 +1000
Subject: [PATCH 5/7] adding README for tests/diff/

---
 tests/diff/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 tests/diff/README.md

diff --git a/tests/diff/README.md b/tests/diff/README.md
new file mode 100644
index 0000000..34c2210
--- /dev/null
+++ b/tests/diff/README.md
@@ -0,0 +1,15 @@
+`comparediffs` provides a tools for downloading a PDF from two different sources, running pdfparanoia on the files, comparing the outputs byte-for-byte,
+and reporting the results.
+
+Typical usage is to first establish two `ssh` tunnels to hosts with access to the literature, e.g. via
+`ssh -D 1080 host1` and `ssh -D 1081 host2`. You can then invoke `comparediffs` via
+
+    ./comparediffs localhost:1080 localhost:1081 < urls
+
+where urls is a file containing one URL per line (e.g. the example file in this directory).
+
+`comparediffs` creates a subdirectory `pdf/`, in which it stores PDFs. It won't try to download the same PDF twice, so if you fix pdfparanoia you'll
+need to clean out some or all of this subdirectory.
+
+It's easy to see which PDFs pdfparanoia failed on, as it leaves copies of the scrubbed files with suffixes `.1.cleaned.pdf` and `.2.cleaned.pdf`.
+When pdfparanoia succeeds (or isn't even needed, because the downloaded files were identical), the scrubbed files are removed.
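For reference, the per-URL check that `comparediffs` performs boils down to the following (a simplified sketch, assuming bash, `pdfparanoia` on the PATH, and the two downloads already saved as `a.pdf` and `b.pdf`; the real script adds caching, proxy checks, and PDF validation):

    if cmp -s a.pdf b.pdf; then
        # nothing to scrub: both sources served byte-identical files
        echo "identical; pdfparanoia not needed"
    elif cmp -s <(pdfparanoia < a.pdf) <(pdfparanoia < b.pdf); then
        # scrubbing removed everything that distinguished the two copies
        echo "pdfparanoia scrubbed the watermarks"
    else
        echo "outputs still differ; keep the cleaned files for inspection"
    fi

Using `cmp -s` rather than `diff` keeps binary PDF noise out of the output; only the verdict is printed.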
From 11b59bd544a2c9bdaa913d5a37fa678a486d2838 Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 23:27:49 +1000
Subject: [PATCH 6/7] fixing README

---
 tests/diff/README.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/diff/README.md b/tests/diff/README.md
index 34c2210..63bfa10 100644
--- a/tests/diff/README.md
+++ b/tests/diff/README.md
@@ -1,15 +1,17 @@
-`comparediffs` provides a tools for downloading a PDF from two different sources, running pdfparanoia on the files, comparing the outputs byte-for-byte,
-and reporting the results.
+`comparediffs` provides a tool which
+* downloads a PDF from two different sources,
+* runs `pdfparanoia` on both files, and
+* compares the outputs byte-for-byte.
 
 Typical usage is to first establish two `ssh` tunnels to hosts with access to the literature, e.g. via
 `ssh -D 1080 host1` and `ssh -D 1081 host2`. You can then invoke `comparediffs` via
 
     ./comparediffs localhost:1080 localhost:1081 < urls
 
-where urls is a file containing one URL per line (e.g. the example file in this directory).
+where `urls` is a file containing one URL per line (e.g. the example file in this directory).
 
-`comparediffs` creates a subdirectory `pdf/`, in which it stores PDFs. It won't try to download the same PDF twice, so if you fix pdfparanoia you'll
-need to clean out some or all of this subdirectory.
+`comparediffs` creates a subdirectory `pdf/`, in which it stores PDFs. It won't try to download the same PDF twice, so if you make changes to `pdfparanoia` you'll
+want to clean out some or all of this subdirectory.
 
-It's easy to see which PDFs pdfparanoia failed on, as it leaves copies of the scrubbed files with suffixes `.1.cleaned.pdf` and `.2.cleaned.pdf`.
-When pdfparanoia succeeds (or isn't even needed, because the downloaded files were identical), the scrubbed files are removed.
+It's easy to see which PDFs `pdfparanoia` failed on afterwards, as it leaves copies of the scrubbed files with suffixes `.1.cleaned.pdf` and `.2.cleaned.pdf`.
+When `pdfparanoia` succeeds (or isn't even needed, because the downloaded files were identical), the scrubbed files are removed.

From 2ec1ca21a64b1c7283a8f77c3079039a94f51716 Mon Sep 17 00:00:00 2001
From: Scott Morrison
Date: Thu, 2 May 2013 23:29:07 +1000
Subject: [PATCH 7/7] hash bang

---
 tests/diff/comparediffs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/diff/comparediffs b/tests/diff/comparediffs
index 0a5c3c4..358530b 100755
--- a/tests/diff/comparediffs
+++ b/tests/diff/comparediffs
@@ -1,3 +1,4 @@
+#!/bin/bash
 if [ "$#" != "2" ]; then
     echo "Please supply the addresses of two SOCKS proxies:"
     echo "compare host1:port1 host2:port2"
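To try the series end-to-end, something like the following should work (a sketch only: `host1` and `host2` stand in for any two machines with journal access, and `pdfparanoia` is assumed to be installed locally):

    # open two background SOCKS tunnels (-f: background, -N: no remote command)
    ssh -f -N -D 1080 host1
    ssh -f -N -D 1081 host2

    # run from tests/diff/ so the script finds `urls` and can create its pdf/ cache
    cd tests/diff
    ./comparediffs localhost:1080 localhost:1081 < urls

Any PDFs that still differ after scrubbing are left behind in `pdf/` as `*.1.cleaned.pdf` and `*.2.cleaned.pdf` pairs for manual inspection.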