Add a -v -v mode, which prints out the details of each matched object (potentially sensitive, potentially bulky)

Remove the spie module import, which appears to do nothing
This commit is contained in:
Zooko O'Whielacronx 2013-02-13 21:08:49 +00:00
parent 9204b2e17e
commit 503b8aead5
6 changed files with 21 additions and 12 deletions

View File

@ -13,13 +13,13 @@ if __name__ == "__main__":
import fileinput import fileinput
from StringIO import StringIO from StringIO import StringIO
verbose = False verbose = 0
while '--verbose' in sys.argv: while '--verbose' in sys.argv:
verbose = True verbose += 1
sys.argv.pop(sys.argv.index('--verbose')) sys.argv.pop(sys.argv.index('--verbose'))
while '-v' in sys.argv: while '-v' in sys.argv:
verbose = True verbose += 1
sys.argv.pop(sys.argv.index('-v')) sys.argv.pop(sys.argv.index('-v'))
import pdfparanoia import pdfparanoia

View File

@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
from .aip import * from .aip import *
from .ieee import * from .ieee import *
from .jstor import * from .jstor import *
from .spie import *

View File

@ -18,7 +18,7 @@ class AmericanInstituteOfPhysics(Plugin):
""" """
@classmethod @classmethod
def scrub(cls, content, verbose=False): def scrub(cls, content, verbose=0):
evil_ids = [] evil_ids = []
# parse the pdf into a pdfminer document # parse the pdf into a pdfminer document
@ -46,7 +46,9 @@ class AmericanInstituteOfPhysics(Plugin):
phrase="Redistribution subject to AIP license or copyright" phrase="Redistribution subject to AIP license or copyright"
if phrase in data: if phrase in data:
if verbose: if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
elif verbose >= 1:
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
evil_ids.append(objid) evil_ids.append(objid)

View File

@ -15,7 +15,7 @@ class IEEEXplore(Plugin):
""" """
@classmethod @classmethod
def scrub(cls, content, verbose=False): def scrub(cls, content, verbose=0):
evil_ids = [] evil_ids = []
# parse the pdf into a pdfminer document # parse the pdf into a pdfminer document
@ -40,7 +40,9 @@ class IEEEXplore(Plugin):
phrase= "Authorized licensed use limited to: " phrase= "Authorized licensed use limited to: "
if phrase in data: if phrase in data:
if verbose: if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
elif verbose >= 1:
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,)) sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
evil_ids.append(objid) evil_ids.append(objid)

View File

@ -34,7 +34,7 @@ class JSTOR(Plugin):
] ]
@classmethod @classmethod
def scrub(cls, content, verbose=False): def scrub(cls, content, verbose=0):
replacements = [] replacements = []
# jstor has certain watermarks only on the first page # jstor has certain watermarks only on the first page
@ -61,13 +61,13 @@ class JSTOR(Plugin):
if all([requirement in data for requirement in JSTOR.requirements]): if all([requirement in data for requirement in JSTOR.requirements]):
better_content = data better_content = data
if verbose:
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements))
# remove the date # remove the date
startpos = better_content.find("This content downloaded ") startpos = better_content.find("This content downloaded ")
endpos = better_content.find(")", startpos) endpos = better_content.find(")", startpos)
segment = better_content[startpos:endpos] segment = better_content[startpos:endpos]
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
better_content = better_content.replace(segment, "") better_content = better_content.replace(segment, "")
# it looks like all of the watermarks are at the end? # it looks like all of the watermarks are at the end?
@ -85,12 +85,18 @@ class JSTOR(Plugin):
startpos = better_content.rfind("/F2 11 Tf\n") startpos = better_content.rfind("/F2 11 Tf\n")
endpos = better_content.find("Tf\n", startpos+5) endpos = better_content.find("Tf\n", startpos+5)
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
better_content = better_content[0:startpos] + better_content[endpos:] better_content = better_content[0:startpos] + better_content[endpos:]
replacements.append([objid, better_content]) replacements.append([objid, better_content])
page_id += 1 page_id += 1
if verbose >= 1 and replacements:
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
for deets in replacements: for deets in replacements:
objid = deets[0] objid = deets[0]
replacement = deets[1] replacement = deets[1]