mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-06-10 04:39:51 +02:00
Add a -v -v mode (verbosity level 2) that also prints the details of each matched object (potentially sensitive, potentially bulky)
Remove the spie plugin import (`from .spie import *`), since the plugin appears to do nothing
This commit is contained in:
parent
9204b2e17e
commit
503b8aead5
|
@ -13,13 +13,13 @@ if __name__ == "__main__":
|
||||||
import fileinput
|
import fileinput
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
|
||||||
verbose = False
|
verbose = 0
|
||||||
while '--verbose' in sys.argv:
|
while '--verbose' in sys.argv:
|
||||||
verbose = True
|
verbose += 1
|
||||||
sys.argv.pop(sys.argv.index('--verbose'))
|
sys.argv.pop(sys.argv.index('--verbose'))
|
||||||
|
|
||||||
while '-v' in sys.argv:
|
while '-v' in sys.argv:
|
||||||
verbose = True
|
verbose += 1
|
||||||
sys.argv.pop(sys.argv.index('-v'))
|
sys.argv.pop(sys.argv.index('-v'))
|
||||||
|
|
||||||
import pdfparanoia
|
import pdfparanoia
|
||||||
|
|
|
@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
|
||||||
from .aip import *
|
from .aip import *
|
||||||
from .ieee import *
|
from .ieee import *
|
||||||
from .jstor import *
|
from .jstor import *
|
||||||
from .spie import *
|
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def scrub(cls, content, verbose=False):
|
def scrub(cls, content, verbose=0):
|
||||||
evil_ids = []
|
evil_ids = []
|
||||||
|
|
||||||
# parse the pdf into a pdfminer document
|
# parse the pdf into a pdfminer document
|
||||||
|
@ -46,7 +46,9 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||||
|
|
||||||
phrase="Redistribution subject to AIP license or copyright"
|
phrase="Redistribution subject to AIP license or copyright"
|
||||||
if phrase in data:
|
if phrase in data:
|
||||||
if verbose:
|
if verbose >= 2:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
|
||||||
|
elif verbose >= 1:
|
||||||
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
|
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
|
||||||
|
|
||||||
evil_ids.append(objid)
|
evil_ids.append(objid)
|
||||||
|
|
|
@ -15,7 +15,7 @@ class IEEEXplore(Plugin):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def scrub(cls, content, verbose=False):
|
def scrub(cls, content, verbose=0):
|
||||||
evil_ids = []
|
evil_ids = []
|
||||||
|
|
||||||
# parse the pdf into a pdfminer document
|
# parse the pdf into a pdfminer document
|
||||||
|
@ -40,7 +40,9 @@ class IEEEXplore(Plugin):
|
||||||
|
|
||||||
phrase= "Authorized licensed use limited to: "
|
phrase= "Authorized licensed use limited to: "
|
||||||
if phrase in data:
|
if phrase in data:
|
||||||
if verbose:
|
if verbose >= 2:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
||||||
|
elif verbose >= 1:
|
||||||
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
|
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
|
||||||
|
|
||||||
evil_ids.append(objid)
|
evil_ids.append(objid)
|
||||||
|
|
|
@ -34,7 +34,7 @@ class JSTOR(Plugin):
|
||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def scrub(cls, content, verbose=False):
|
def scrub(cls, content, verbose=0):
|
||||||
replacements = []
|
replacements = []
|
||||||
|
|
||||||
# jstor has certain watermarks only on the first page
|
# jstor has certain watermarks only on the first page
|
||||||
|
@ -61,13 +61,13 @@ class JSTOR(Plugin):
|
||||||
if all([requirement in data for requirement in JSTOR.requirements]):
|
if all([requirement in data for requirement in JSTOR.requirements]):
|
||||||
better_content = data
|
better_content = data
|
||||||
|
|
||||||
if verbose:
|
|
||||||
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, cls.requirements))
|
|
||||||
|
|
||||||
# remove the date
|
# remove the date
|
||||||
startpos = better_content.find("This content downloaded ")
|
startpos = better_content.find("This content downloaded ")
|
||||||
endpos = better_content.find(")", startpos)
|
endpos = better_content.find(")", startpos)
|
||||||
segment = better_content[startpos:endpos]
|
segment = better_content[startpos:endpos]
|
||||||
|
if verbose >= 2 and replacements:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
||||||
|
|
||||||
better_content = better_content.replace(segment, "")
|
better_content = better_content.replace(segment, "")
|
||||||
|
|
||||||
# it looks like all of the watermarks are at the end?
|
# it looks like all of the watermarks are at the end?
|
||||||
|
@ -85,12 +85,18 @@ class JSTOR(Plugin):
|
||||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||||
endpos = better_content.find("Tf\n", startpos+5)
|
endpos = better_content.find("Tf\n", startpos+5)
|
||||||
|
|
||||||
|
if verbose >= 2 and replacements:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
||||||
|
|
||||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||||
|
|
||||||
replacements.append([objid, better_content])
|
replacements.append([objid, better_content])
|
||||||
|
|
||||||
page_id += 1
|
page_id += 1
|
||||||
|
|
||||||
|
if verbose >= 1 and replacements:
|
||||||
|
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
||||||
|
|
||||||
for deets in replacements:
|
for deets in replacements:
|
||||||
objid = deets[0]
|
objid = deets[0]
|
||||||
replacement = deets[1]
|
replacement = deets[1]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user