mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
comparediffs seems to be working nicely
This commit is contained in:
parent
54b6ab070a
commit
2fb3783dea
2
.gitignore
vendored
2
.gitignore
vendored
@ -15,4 +15,4 @@ pdfparanoia.egg-info/
|
|||||||
/*.pdf
|
/*.pdf
|
||||||
|
|
||||||
# temporary pdfs in tests/diff/
|
# temporary pdfs in tests/diff/
|
||||||
tests/diff/*.pdf
|
tests/diff/pdf
|
||||||
|
@ -1,23 +1,68 @@
|
|||||||
for url in `cat urls`; do
|
if [ "$#" != "2" ]; then
|
||||||
echo "Retrieving $url";
|
echo "Please supply the addresses of two SOCKS proxies:"
|
||||||
# TODO allow customizing the proxies, or more generally the retrieval mechanisms
|
echo "compare host1:port1 host2:port2"
|
||||||
# TODO allow local caching
|
exit 1
|
||||||
curl --socks localhost:1080 $url > 1.pdf
|
fi
|
||||||
curl --socks localhost:1083 $url > 2.pdf
|
|
||||||
# TODO verify that we actually obtained pdfs
|
proxy1="$1"
|
||||||
if diff 1.pdf 2.pdf; then
|
proxy2="$2"
|
||||||
echo "PDFs are identical already, no need to use pdfparanoia"
|
|
||||||
else
|
curl -s --socks $proxy1 http://google.com | grep "The document has moved" > /dev/null
|
||||||
cat 1.pdf | pdfparanoia > 1.cleaned.pdf
|
if [ "$?" != "0" ]; then
|
||||||
cat 2.pdf | pdfparanoia > 2.cleaned.pdf
|
echo "SOCKS proxy $proxy1 doesn't seem to be working."
|
||||||
if diff 1.cleaned.pdf 2.cleaned.pdf; then
|
exit 2
|
||||||
echo "pdfparanoia successful scrubbed the PDFs"
|
fi
|
||||||
else
|
curl -s --socks $proxy2 http://google.com | grep "The document has moved" > /dev/null
|
||||||
echo "pdfparanoia failed!"
|
if [ "$?" != "0" ]; then
|
||||||
fi
|
echo "SOCKS proxy $proxy2 doesn't seem to be working."
|
||||||
rm 1.cleaned.pdf
|
exit 3
|
||||||
rm 2.cleaned.pdf
|
fi
|
||||||
|
|
||||||
|
echo "--------------------------------------------------------------------------------------------------------------------------------"
|
||||||
|
echo "Comparing PDFs downloaded via '$proxy1' and '$proxy2'"
|
||||||
|
echo "Please enter URLs, one per line."
|
||||||
|
echo
|
||||||
|
echo "PDFs which still differ after pdfparanoia will be left in the pdf/ subdirectory, with suffixes .1.cleaned.pdf and .2.cleaned.pdf"
|
||||||
|
echo "--------------------------------------------------------------------------------------------------------------------------------"
|
||||||
|
echo
|
||||||
|
|
||||||
|
mkdir -p pdf
|
||||||
|
|
||||||
|
while read url; do
|
||||||
|
hash=`echo $url | openssl sha1 | cut -d ' ' -f 2`
|
||||||
|
echo "Retrieving $url, with hash $hash";
|
||||||
|
if [ ! -s pdf/$hash.1.pdf ]; then
|
||||||
|
curl --socks $proxy1 -L --cookie cookie.jar -A "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" $url > pdf/$hash.1.pdf
|
||||||
fi
|
fi
|
||||||
rm 1.pdf
|
if [ "`head -c 4 pdf/$hash.1.pdf`" == "%PDF" ]; then
|
||||||
rm 2.pdf
|
if [ ! -s pdf/$hash.2.pdf ]; then
|
||||||
|
curl --socks $proxy2 -L --cookie cookie.jar -A "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" $url > pdf/$hash.2.pdf
|
||||||
|
fi
|
||||||
|
if [ "`head -c 4 pdf/$hash.2.pdf`" == "%PDF" ]; then
|
||||||
|
if cmp -s pdf/$hash.1.pdf pdf/$hash.2.pdf; then
|
||||||
|
echo "PDFs are identical already, no need to use pdfparanoia"
|
||||||
|
else
|
||||||
|
echo "PDFs differ, running pdfparanoia"
|
||||||
|
if [ ! -s pdf/$hash.1.cleaned.pdf ]; then
|
||||||
|
cat pdf/$hash.1.pdf | pdfparanoia > pdf/$hash.1.cleaned.pdf
|
||||||
|
fi
|
||||||
|
if [ ! -s pdf/$hash.2.cleaned.pdf ]; then
|
||||||
|
cat pdf/$hash.2.pdf | pdfparanoia > pdf/$hash.2.cleaned.pdf
|
||||||
|
fi
|
||||||
|
if cmp -s pdf/$hash.1.cleaned.pdf pdf/$hash.2.cleaned.pdf; then
|
||||||
|
echo "pdfparanoia successful scrubbed the PDFs"
|
||||||
|
rm pdf/$hash.*.cleaned.pdf
|
||||||
|
else
|
||||||
|
echo "pdfparanoia failed!"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Download failed from source 2"
|
||||||
|
rm pdf/$hash.*.pdf
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Download failed from source 1"
|
||||||
|
rm pdf/$hash.*.pdf
|
||||||
|
fi
|
||||||
|
echo
|
||||||
done
|
done
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
|
|
||||||
http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
|
|
||||||
http://annals.math.princeton.edu/wp-content/uploads/annals-v176-n2-p11-s.pdf
|
http://annals.math.princeton.edu/wp-content/uploads/annals-v176-n2-p11-s.pdf
|
||||||
|
http://www.ams.org/journals/mcom/2012-81-278/S0025-5718-2011-02542-1/S0025-5718-2011-02542-1.pdf
|
||||||
|
http://nyjm.albany.edu/j/2009/15-14p.pdf
|
||||||
|
http://link.springer.com/content/pdf/10.1007/s00440-011-0397-9
|
||||||
http://www.worldscientific.com/doi/pdf/10.1142/S2010326311500018
|
http://www.worldscientific.com/doi/pdf/10.1142/S2010326311500018
|
||||||
http://www.sciencedirect.com/science?_ob=MiamiImageURL&_cid=272585&_user=994540&_pii=S0001870812001806&_check=y&_origin=article&_zone=toolbar&_coverDate=10-Sep-2012&view=c&originContentFamily=serial&wchp=dGLzVlt-zSkWb&md5=bfeb5e0619d45362640529aff02baeda&pid=1-s2.0-S0001870812001806-main.pdf
|
http://www.sciencedirect.com/science?_ob=MiamiImageURL&_cid=272585&_user=994540&_pii=S0001870812001806&_check=y&_origin=article&_zone=toolbar&_coverDate=10-Sep-2012&view=c&originContentFamily=serial&wchp=dGLzVlt-zSkWb&md5=bfeb5e0619d45362640529aff02baeda&pid=1-s2.0-S0001870812001806-main.pdf
|
||||||
http://www.ams.org/journals/mcom/2012-81-278/S0025-5718-2011-02542-1/S0025-5718-2011-02542-1.pdf
|
|
||||||
http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
|
|
||||||
http://nyjm.albany.edu/j/2009/15-14p.pdf
|
|
||||||
|
2
tests/diff/urls.denied
Normal file
2
tests/diff/urls.denied
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
http://msp.org/apde/2012/5-2/apde-v5-n2-p07-s.pdf
|
||||||
|
http://www.ems-ph.org/journals/show_pdf.php?issn=1661-7207&vol=5&iss=2&rank=6
|
Loading…
Reference in New Issue
Block a user