mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 15:05:52 +01:00
Merge pull request #25 from semorrison/master
comparediffs, a tool to download, scrub, and compare PDFs
This commit is contained in:
commit
9d26a0aa01
3
.gitignore
vendored
3
.gitignore
vendored
@ -13,3 +13,6 @@ pdfparanoia.egg-info/
|
||||
|
||||
# ignore pdfs in the top-level dir
|
||||
/*.pdf
|
||||
|
||||
# temporary pdfs in tests/diff/
|
||||
tests/diff/pdf
|
||||
|
17
tests/diff/README.md
Normal file
17
tests/diff/README.md
Normal file
@ -0,0 +1,17 @@
|
||||
`comparediffs` provides a tool which
|
||||
* downloads a PDF from two different sources,
|
||||
* runs `pdfparanoia` on both files, and
|
||||
* compares the outputs byte-for-byte.
|
||||
|
||||
Typical usage is to first establish two `ssh` tunnels to hosts with access to the literature, e.g. via
|
||||
`ssh -D 1080 host1` and `ssh -D 1081 host2`. You can then invoke `comparediffs` via
|
||||
|
||||
./comparediffs localhost:1080 localhost:1081 < urls
|
||||
|
||||
where `urls` is a file containing one URL per line (e.g. the example file in this directory).
|
||||
|
||||
`comparediffs` creates a subdirectory `pdf/`, in which it stores PDFs. It won't try to download the same PDF twice, so if you make changes to `pdfparanoia` you'll
|
||||
want to clean out some or all of this subdirectory.
|
||||
|
||||
It's easy to see which PDFs `pdfparanoia` failed on afterwards, as it leaves copies of the scrubbed files with suffixes `.1.cleaned.pdf` and `.2.cleaned.pdf`.
|
||||
When `pdfparanoia` succeeds (or isn't even needed, because the downloaded files were identical), the scrubbed files are removed.
|
69
tests/diff/comparediffs
Executable file
69
tests/diff/comparediffs
Executable file
@ -0,0 +1,69 @@
|
||||
#!/bin/bash
#
# comparediffs - download each PDF through two SOCKS proxies, scrub both
# copies with pdfparanoia, and check that the scrubbed outputs are
# byte-for-byte identical.
#
# Usage:   comparediffs host1:port1 host2:port2 < urls
# Input:   URLs on stdin, one per line (blank lines are skipped).
# Files:   downloads are cached as pdf/<sha1-of-url>.{1,2}.pdf; scrubbed
#          copies that still differ are left as pdf/<sha1>.{1,2}.cleaned.pdf.
# Exit:    1 wrong number of arguments, 2 first proxy unusable,
#          3 second proxy unusable.

readonly USER_AGENT="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
readonly ACCEPT_HEADER="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
readonly RULE="--------------------------------------------------------------------------------------------------------------------------------"

# Succeeds if a plain-HTTP request to google.com through SOCKS proxy $1
# returns the redirect page the script uses as its "proxy is alive" probe.
proxy_works() {
  curl -s --socks5 "$1" http://google.com | grep -q "The document has moved"
}

# Downloads URL $2 through SOCKS proxy $1 into file $3, unless $3 already
# exists and is non-empty (i.e. it was cached by an earlier run).
fetch_pdf() {
  local proxy=$1 url=$2 dest=$3
  if [[ ! -s "$dest" ]]; then
    # The URL is quoted: journal URLs routinely contain '?' and '&', which
    # must not be glob-expanded or word-split by the shell.
    curl --socks5 "$proxy" -L --cookie cookie.jar \
      -A "$USER_AGENT" -H "$ACCEPT_HEADER" "$url" > "$dest"
  fi
}

# Succeeds if file $1 starts with the PDF magic bytes.
is_pdf() {
  [[ "$(head -c 4 "$1")" == "%PDF" ]]
}

# Runs pdfparanoia on $1, writing the result to $2, unless $2 already exists
# and is non-empty.
scrub_pdf() {
  local src=$1 dest=$2
  if [[ ! -s "$dest" ]]; then
    pdfparanoia < "$src" > "$dest"
  fi
}

# Fetches URL $3 via proxies $1 and $2, then compares the raw and (if needed)
# scrubbed downloads, reporting the outcome on stdout.
compare_url() {
  local proxy1=$1 proxy2=$2 url=$3
  local hash first second first_clean second_clean

  # Same bytes as the original `echo $url | openssl sha1`, so files cached
  # by earlier runs keep their names.
  hash=$(printf '%s\n' "$url" | openssl sha1 | cut -d ' ' -f 2)
  if [[ -z "$hash" ]]; then
    # Without this guard an empty hash would make the cleanup below run
    # `rm pdf/.*.pdf`.
    echo "Could not compute a hash for $url" >&2
    return 1
  fi
  first="pdf/$hash.1.pdf"
  second="pdf/$hash.2.pdf"
  first_clean="pdf/$hash.1.cleaned.pdf"
  second_clean="pdf/$hash.2.cleaned.pdf"

  echo "Retrieving $url, with hash $hash"

  fetch_pdf "$proxy1" "$url" "$first"
  if ! is_pdf "$first"; then
    echo "Download failed from source 1"
    rm -- pdf/"$hash".*.pdf
    return
  fi

  fetch_pdf "$proxy2" "$url" "$second"
  if ! is_pdf "$second"; then
    echo "Download failed from source 2"
    rm -- pdf/"$hash".*.pdf
    return
  fi

  if cmp -s "$first" "$second"; then
    echo "PDFs are identical already, no need to use pdfparanoia"
    return
  fi

  echo "PDFs differ, running pdfparanoia"
  scrub_pdf "$first" "$first_clean"
  scrub_pdf "$second" "$second_clean"

  if cmp -s "$first_clean" "$second_clean"; then
    echo "pdfparanoia successful scrubbed the PDFs"
    rm -- pdf/"$hash".*.cleaned.pdf
  else
    # The cleaned files are deliberately kept so the failure can be inspected.
    echo "pdfparanoia failed!"
  fi
}

main() {
  if [[ "$#" != "2" ]]; then
    echo "Please supply the addresses of two SOCKS proxies:" >&2
    echo "comparediffs host1:port1 host2:port2" >&2
    exit 1
  fi

  local proxy1=$1 proxy2=$2 url

  if ! proxy_works "$proxy1"; then
    echo "SOCKS proxy $proxy1 doesn't seem to be working." >&2
    exit 2
  fi
  if ! proxy_works "$proxy2"; then
    echo "SOCKS proxy $proxy2 doesn't seem to be working." >&2
    exit 3
  fi

  printf '%s\n' "$RULE"
  echo "Comparing PDFs downloaded via '$proxy1' and '$proxy2'"
  echo "Please enter URLs, one per line."
  echo
  echo "PDFs which still differ after pdfparanoia will be left in the pdf/ subdirectory, with suffixes .1.cleaned.pdf and .2.cleaned.pdf"
  printf '%s\n' "$RULE"
  echo

  mkdir -p pdf

  # -r keeps backslashes literal; the `|| [[ -n ... ]]` clause processes a
  # final line that lacks a trailing newline.
  while read -r url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue
    compare_url "$proxy1" "$proxy2" "$url"
    echo
  done
}

main "$@"
|
20
tests/diff/urls
Normal file
20
tests/diff/urls
Normal file
@ -0,0 +1,20 @@
|
||||
http://annals.math.princeton.edu/wp-content/uploads/annals-v176-n2-p11-s.pdf
|
||||
http://www.ams.org/journals/mcom/2012-81-278/S0025-5718-2011-02542-1/S0025-5718-2011-02542-1.pdf
|
||||
http://www.ams.org/journals/bull/2010-47-04/S0273-0979-10-01296-6/S0273-0979-10-01296-6.pdf
|
||||
http://nyjm.albany.edu/j/2009/15-14p.pdf
|
||||
http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
|
||||
http://www.worldscientific.com/doi/pdf/10.1142/S2010326311500018
|
||||
http://www.sciencedirect.com/science?_ob=MiamiImageURL&_cid=272585&_user=994540&_pii=S0001870812001806&_check=y&_origin=article&_zone=toolbar&_coverDate=10-Sep-2012&view=c&originContentFamily=serial&wchp=dGLzVlt-zSkWb&md5=bfeb5e0619d45362640529aff02baeda&pid=1-s2.0-S0001870812001806-main.pdf
|
||||
http://imrn.oxfordjournals.org/content/2002/4/165.full.pdf
|
||||
http://www.tandfonline.com/doi/pdf/10.2991/jnmp.2006.13.3.2
|
||||
http://www.jstor.org/stable/pdfplus/121105?acceptTC=true
|
||||
http://muse.jhu.edu/journals/american_journal_of_mathematics/v118/118.5bernstein.pdf
|
||||
http://archive.numdam.org/article/CM_1980__41_2_245_0.pdf
|
||||
http://www.pnas.org/content/102/26/9099.full.pdf
|
||||
http://iopscience.iop.org/1126-6708/2004/06/043/pdf/1126-6708_2004_06_043.pdf
|
||||
http://scitation.aip.org/getpdf/servlet/GetPDFServlet?filetype=pdf&id=JMAPAQ000042000007002896000001&idtype=cvips&doi=10.1063/1.1372177&prog=normal
|
||||
http://msp.org/gt/2009/13-03/gt-2009-13-033s.pdf
|
||||
http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
|
||||
http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
|
||||
http://www.ems-ph.org/journals/show_pdf.php?issn=0010-2571&vol=88&iss=1&rank=3
|
||||
http://prd.aps.org/pdf/PRD/v13/i12/p3491_1
|
Loading…
Reference in New Issue
Block a user