Merge pull request #21 from zooko/verbose-option

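Add a verbosity argument: each `--verbose` or `-v` flag on the command line raises the verbosity level, which is passed to `pdfparanoia.scrub()` and on to every plugin's `scrub()`. When verbose, plugins report on stderr the watermark objects they find.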
This commit is contained in:
Bryan Bishop 2013-02-14 01:39:19 -08:00
commit ee483ab986
8 changed files with 56 additions and 21 deletions

View File

@ -13,6 +13,15 @@ if __name__ == "__main__":
import fileinput import fileinput
from StringIO import StringIO from StringIO import StringIO
verbose = 0
while '--verbose' in sys.argv:
verbose += 1
sys.argv.pop(sys.argv.index('--verbose'))
while '-v' in sys.argv:
verbose += 1
sys.argv.pop(sys.argv.index('-v'))
import pdfparanoia import pdfparanoia
# read in all lines # read in all lines
@ -21,7 +30,7 @@ if __name__ == "__main__":
content += line content += line
# scrub the pdf to get rid of watermarks # scrub the pdf to get rid of watermarks
output = pdfparanoia.scrub(StringIO(content)) output = pdfparanoia.scrub(StringIO(content), verbose=verbose)
# dump to output # dump to output
sys.stdout.write(output) sys.stdout.write(output)

View File

@ -32,7 +32,7 @@ def find_plugins():
plugins = [each[1] for each in plugins] plugins = [each[1] for each in plugins]
return plugins return plugins
def scrub(obj): def scrub(obj, verbose=False):
""" """
Removes watermarks from a pdf and returns the resulting pdf as a string. Removes watermarks from a pdf and returns the resulting pdf as a string.
""" """
@ -50,7 +50,7 @@ def scrub(obj):
# clean this pdf as much as possible # clean this pdf as much as possible
for plugin in plugins: for plugin in plugins:
content = plugin.scrub(content) content = plugin.scrub(content, verbose=verbose)
return content return content

View File

@ -8,8 +8,8 @@ Defines how plugins work.
""" """
class Plugin: class Plugin:
@staticmethod @classmethod
def scrub(content): def scrub(cls, content, verbose=False):
""" """
Removes watermarks from the given pdf. Removes watermarks from the given pdf.
""" """

View File

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from copy import copy from copy import copy
import sys
from ..parser import parse_content from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin from ..plugin import Plugin
class SPIE(Plugin): class SPIE(Plugin):
@ -18,8 +18,8 @@ class SPIE(Plugin):
""" """
@staticmethod @classmethod
def scrub(content): def scrub(cls, content, verbose=False):
evil_ids = [] evil_ids = []
# parse the pdf into a pdfminer document # parse the pdf into a pdfminer document
@ -40,7 +40,10 @@ class SPIE(Plugin):
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
data = copy(obj.get_data()) data = copy(obj.get_data())
if "Downloaded From:" in data: phrase="Downloaded From:"
if phrase in data:
if verbose:
sys.stderr.write("%s: found object %s with %r; omitting..." % (cls.__name__, objid, phrase))
evil_ids.append(objid) evil_ids.append(objid)
for objid in evil_ids: for objid in evil_ids:

View File

@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
from .aip import * from .aip import *
from .ieee import * from .ieee import *
from .jstor import * from .jstor import *
from .spie import *

View File

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import sys
from copy import copy from copy import copy
from ..parser import parse_content from ..parser import parse_content
@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin):
attached for whatever reason. attached for whatever reason.
""" """
@staticmethod @classmethod
def scrub(content): def scrub(cls, content, verbose=0):
evil_ids = [] evil_ids = []
# parse the pdf into a pdfminer document # parse the pdf into a pdfminer document
@ -42,7 +44,13 @@ class AmericanInstituteOfPhysics(Plugin):
#rawdata = copy(obj.rawdata) #rawdata = copy(obj.rawdata)
data = copy(obj.get_data()) data = copy(obj.get_data())
if "Redistribution subject to AIP license or copyright" in data: phrase="Redistribution subject to AIP license or copyright"
if phrase in data:
if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
elif verbose >= 1:
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
evil_ids.append(objid) evil_ids.append(objid)
for objid in evil_ids: for objid in evil_ids:

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from copy import copy from copy import copy
import sys
from ..parser import parse_content from ..parser import parse_content
from ..eraser import remove_object_by_id from ..eraser import remove_object_by_id
@ -13,8 +14,8 @@ class IEEEXplore(Plugin):
""" """
@staticmethod @classmethod
def scrub(content): def scrub(cls, content, verbose=0):
evil_ids = [] evil_ids = []
# parse the pdf into a pdfminer document # parse the pdf into a pdfminer document
@ -37,7 +38,13 @@ class IEEEXplore(Plugin):
#rawdata = copy(obj.rawdata) #rawdata = copy(obj.rawdata)
data = copy(obj.get_data()) data = copy(obj.get_data())
if "Authorized licensed use limited to: " in data: phrase= "Authorized licensed use limited to: "
if phrase in data:
if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
elif verbose >= 1:
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
evil_ids.append(objid) evil_ids.append(objid)
for objid in evil_ids: for objid in evil_ids:

View File

@ -2,9 +2,10 @@
from copy import copy from copy import copy
import sys
from ..parser import parse_content from ..parser import parse_content
from ..eraser import ( from ..eraser import (
remove_object_by_id,
replace_object_with, replace_object_with,
) )
from ..plugin import Plugin from ..plugin import Plugin
@ -32,8 +33,8 @@ class JSTOR(Plugin):
"This content downloaded on", "This content downloaded on",
] ]
@staticmethod @classmethod
def scrub(content): def scrub(cls, content, verbose=0):
replacements = [] replacements = []
# jstor has certain watermarks only on the first page # jstor has certain watermarks only on the first page
@ -54,8 +55,6 @@ class JSTOR(Plugin):
if hasattr(obj, "attrs"): if hasattr(obj, "attrs"):
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
length = obj.attrs["Length"]
rawdata = copy(obj.rawdata)
data = copy(obj.get_data()) data = copy(obj.get_data())
# make sure all of the requirements are in there # make sure all of the requirements are in there
@ -66,6 +65,9 @@ class JSTOR(Plugin):
startpos = better_content.find("This content downloaded ") startpos = better_content.find("This content downloaded ")
endpos = better_content.find(")", startpos) endpos = better_content.find(")", startpos)
segment = better_content[startpos:endpos] segment = better_content[startpos:endpos]
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
better_content = better_content.replace(segment, "") better_content = better_content.replace(segment, "")
# it looks like all of the watermarks are at the end? # it looks like all of the watermarks are at the end?
@ -82,12 +84,19 @@ class JSTOR(Plugin):
if page_id == 0 and "/F2 11 Tf\n" in better_content: if page_id == 0 and "/F2 11 Tf\n" in better_content:
startpos = better_content.rfind("/F2 11 Tf\n") startpos = better_content.rfind("/F2 11 Tf\n")
endpos = better_content.find("Tf\n", startpos+5) endpos = better_content.find("Tf\n", startpos+5)
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
better_content = better_content[0:startpos] + better_content[endpos:] better_content = better_content[0:startpos] + better_content[endpos:]
replacements.append([objid, better_content]) replacements.append([objid, better_content])
page_id += 1 page_id += 1
if verbose >= 1 and replacements:
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
for deets in replacements: for deets in replacements:
objid = deets[0] objid = deets[0]
replacement = deets[1] replacement = deets[1]