mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
Add a "--verbose" option that writes to stderr whenever it finds anything to omit
Also cleaned up some flakes noticed by pyflakes, and made scrub() a @classmethod instead of a @staticmethod so that the class can be used in the verbose output. Caveats: (1) there are no unit tests for this patch; (2) your stderr logs may now contain potentially sensitive information; (3) the implementation of argument parsing is very low-tech (a *good* way to do argument parsing is the "argparse" module).
This commit is contained in:
parent
caed396870
commit
56cc7719da
@ -13,6 +13,15 @@ if __name__ == "__main__":
|
||||
import fileinput
|
||||
from StringIO import StringIO
|
||||
|
||||
verbose = False
|
||||
while '--verbose' in sys.argv:
|
||||
verbose = True
|
||||
sys.argv.pop(sys.argv.index('--verbose'))
|
||||
|
||||
while '-v' in sys.argv:
|
||||
verbose = True
|
||||
sys.argv.pop(sys.argv.index('-v'))
|
||||
|
||||
import pdfparanoia
|
||||
|
||||
# read in all lines
|
||||
@ -21,7 +30,7 @@ if __name__ == "__main__":
|
||||
content += line
|
||||
|
||||
# scrub the pdf to get rid of watermarks
|
||||
output = pdfparanoia.scrub(StringIO(content))
|
||||
output = pdfparanoia.scrub(StringIO(content), verbose=verbose)
|
||||
|
||||
# dump to output
|
||||
sys.stdout.write(output)
|
||||
|
@ -32,7 +32,7 @@ def find_plugins():
|
||||
plugins = [each[1] for each in plugins]
|
||||
return plugins
|
||||
|
||||
def scrub(obj):
|
||||
def scrub(obj, verbose=False):
|
||||
"""
|
||||
Removes watermarks from a pdf and returns the resulting pdf as a string.
|
||||
"""
|
||||
@ -50,7 +50,7 @@ def scrub(obj):
|
||||
|
||||
# clean this pdf as much as possible
|
||||
for plugin in plugins:
|
||||
content = plugin.scrub(content)
|
||||
content = plugin.scrub(content, verbose=verbose)
|
||||
|
||||
return content
|
||||
|
||||
|
@ -8,8 +8,8 @@ Defines how plugins work.
|
||||
"""
|
||||
|
||||
class Plugin:
|
||||
@staticmethod
|
||||
def scrub(content):
|
||||
@classmethod
|
||||
def scrub(cls, content, verbose=False):
|
||||
"""
|
||||
Removes watermarks from the given pdf.
|
||||
"""
|
||||
|
@ -1,5 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..parser import parse_content
|
||||
@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||
attached for whatever reason.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def scrub(content):
|
||||
@classmethod
|
||||
def scrub(cls, content, verbose=False):
|
||||
evil_ids = []
|
||||
|
||||
# parse the pdf into a pdfminer document
|
||||
@ -43,6 +45,9 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||
data = copy(obj.get_data())
|
||||
|
||||
if "Redistribution subject to AIP license or copyright" in data:
|
||||
if verbose:
|
||||
sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,))
|
||||
|
||||
evil_ids.append(objid)
|
||||
|
||||
for objid in evil_ids:
|
||||
|
@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
import sys
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
@ -13,8 +14,8 @@ class IEEEXplore(Plugin):
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def scrub(content):
|
||||
@classmethod
|
||||
def scrub(cls, content, verbose=False):
|
||||
evil_ids = []
|
||||
|
||||
# parse the pdf into a pdfminer document
|
||||
@ -38,6 +39,9 @@ class IEEEXplore(Plugin):
|
||||
data = copy(obj.get_data())
|
||||
|
||||
if "Authorized licensed use limited to: " in data:
|
||||
if verbose:
|
||||
sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, data,))
|
||||
|
||||
evil_ids.append(objid)
|
||||
|
||||
for objid in evil_ids:
|
||||
|
@ -2,9 +2,10 @@
|
||||
|
||||
from copy import copy
|
||||
|
||||
import sys
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import (
|
||||
remove_object_by_id,
|
||||
replace_object_with,
|
||||
)
|
||||
from ..plugin import Plugin
|
||||
@ -32,8 +33,8 @@ class JSTOR(Plugin):
|
||||
"This content downloaded on",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def scrub(content):
|
||||
@classmethod
|
||||
def scrub(cls, content, verbose=False):
|
||||
replacements = []
|
||||
|
||||
# jstor has certain watermarks only on the first page
|
||||
@ -54,8 +55,6 @@ class JSTOR(Plugin):
|
||||
|
||||
if hasattr(obj, "attrs"):
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
length = obj.attrs["Length"]
|
||||
rawdata = copy(obj.rawdata)
|
||||
data = copy(obj.get_data())
|
||||
|
||||
# make sure all of the requirements are in there
|
||||
@ -82,6 +81,10 @@ class JSTOR(Plugin):
|
||||
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||
endpos = better_content.find("Tf\n", startpos+5)
|
||||
|
||||
if verbose:
|
||||
sys.stderr.write("%s: Found object with %r; omitting..." % (cls.__name__, better_content[startpos:endpos],))
|
||||
|
||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||
|
||||
replacements.append([objid, better_content])
|
||||
|
@ -1,9 +1,9 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
import sys
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class SPIE(Plugin):
|
||||
@ -18,8 +18,8 @@ class SPIE(Plugin):
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def scrub(content):
|
||||
@classmethod
|
||||
def scrub(cls, content, verbose=False):
|
||||
evil_ids = []
|
||||
|
||||
# parse the pdf into a pdfminer document
|
||||
@ -41,6 +41,8 @@ class SPIE(Plugin):
|
||||
data = copy(obj.get_data())
|
||||
|
||||
if "Downloaded From:" in data:
|
||||
if verbose:
|
||||
sys.stderr.write("%s: found object with %r; omitting..." % (cls.__name__, data))
|
||||
evil_ids.append(objid)
|
||||
|
||||
for objid in evil_ids:
|
||||
|
Loading…
Reference in New Issue
Block a user