mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
commit
ee483ab986
@ -13,6 +13,15 @@ if __name__ == "__main__":
|
|||||||
import fileinput
|
import fileinput
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
|
||||||
|
verbose = 0
|
||||||
|
while '--verbose' in sys.argv:
|
||||||
|
verbose += 1
|
||||||
|
sys.argv.pop(sys.argv.index('--verbose'))
|
||||||
|
|
||||||
|
while '-v' in sys.argv:
|
||||||
|
verbose += 1
|
||||||
|
sys.argv.pop(sys.argv.index('-v'))
|
||||||
|
|
||||||
import pdfparanoia
|
import pdfparanoia
|
||||||
|
|
||||||
# read in all lines
|
# read in all lines
|
||||||
@ -21,7 +30,7 @@ if __name__ == "__main__":
|
|||||||
content += line
|
content += line
|
||||||
|
|
||||||
# scrub the pdf to get rid of watermarks
|
# scrub the pdf to get rid of watermarks
|
||||||
output = pdfparanoia.scrub(StringIO(content))
|
output = pdfparanoia.scrub(StringIO(content), verbose=verbose)
|
||||||
|
|
||||||
# dump to output
|
# dump to output
|
||||||
sys.stdout.write(output)
|
sys.stdout.write(output)
|
||||||
|
@ -32,7 +32,7 @@ def find_plugins():
|
|||||||
plugins = [each[1] for each in plugins]
|
plugins = [each[1] for each in plugins]
|
||||||
return plugins
|
return plugins
|
||||||
|
|
||||||
def scrub(obj):
|
def scrub(obj, verbose=False):
|
||||||
"""
|
"""
|
||||||
Removes watermarks from a pdf and returns the resulting pdf as a string.
|
Removes watermarks from a pdf and returns the resulting pdf as a string.
|
||||||
"""
|
"""
|
||||||
@ -50,7 +50,7 @@ def scrub(obj):
|
|||||||
|
|
||||||
# clean this pdf as much as possible
|
# clean this pdf as much as possible
|
||||||
for plugin in plugins:
|
for plugin in plugins:
|
||||||
content = plugin.scrub(content)
|
content = plugin.scrub(content, verbose=verbose)
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
@ -8,8 +8,8 @@ Defines how plugins work.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
class Plugin:
|
class Plugin:
|
||||||
@staticmethod
|
@classmethod
|
||||||
def scrub(content):
|
def scrub(cls, content, verbose=False):
|
||||||
"""
|
"""
|
||||||
Removes watermarks from the given pdf.
|
Removes watermarks from the given pdf.
|
||||||
"""
|
"""
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
import sys
|
||||||
|
|
||||||
from ..parser import parse_content
|
from ..parser import parse_content
|
||||||
from ..eraser import remove_object_by_id
|
|
||||||
from ..plugin import Plugin
|
from ..plugin import Plugin
|
||||||
|
|
||||||
class SPIE(Plugin):
|
class SPIE(Plugin):
|
||||||
@ -18,8 +18,8 @@ class SPIE(Plugin):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@classmethod
|
||||||
def scrub(content):
|
def scrub(cls, content, verbose=False):
|
||||||
evil_ids = []
|
evil_ids = []
|
||||||
|
|
||||||
# parse the pdf into a pdfminer document
|
# parse the pdf into a pdfminer document
|
||||||
@ -40,7 +40,10 @@ class SPIE(Plugin):
|
|||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
if "Downloaded From:" in data:
|
phrase="Downloaded From:"
|
||||||
|
if phrase in data:
|
||||||
|
if verbose:
|
||||||
|
sys.stderr.write("%s: found object %s with %r; omitting..." % (cls.__name__, objid, phrase))
|
||||||
evil_ids.append(objid)
|
evil_ids.append(objid)
|
||||||
|
|
||||||
for objid in evil_ids:
|
for objid in evil_ids:
|
@ -10,5 +10,4 @@ Scrubbing machines. Bubbles mandatory.
|
|||||||
from .aip import *
|
from .aip import *
|
||||||
from .ieee import *
|
from .ieee import *
|
||||||
from .jstor import *
|
from .jstor import *
|
||||||
from .spie import *
|
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from ..parser import parse_content
|
from ..parser import parse_content
|
||||||
@ -15,8 +17,8 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||||||
attached for whatever reason.
|
attached for whatever reason.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@classmethod
|
||||||
def scrub(content):
|
def scrub(cls, content, verbose=0):
|
||||||
evil_ids = []
|
evil_ids = []
|
||||||
|
|
||||||
# parse the pdf into a pdfminer document
|
# parse the pdf into a pdfminer document
|
||||||
@ -42,7 +44,13 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||||||
#rawdata = copy(obj.rawdata)
|
#rawdata = copy(obj.rawdata)
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
if "Redistribution subject to AIP license or copyright" in data:
|
phrase="Redistribution subject to AIP license or copyright"
|
||||||
|
if phrase in data:
|
||||||
|
if verbose >= 2:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
|
||||||
|
elif verbose >= 1:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
|
||||||
|
|
||||||
evil_ids.append(objid)
|
evil_ids.append(objid)
|
||||||
|
|
||||||
for objid in evil_ids:
|
for objid in evil_ids:
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
import sys
|
||||||
|
|
||||||
from ..parser import parse_content
|
from ..parser import parse_content
|
||||||
from ..eraser import remove_object_by_id
|
from ..eraser import remove_object_by_id
|
||||||
@ -13,8 +14,8 @@ class IEEEXplore(Plugin):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@classmethod
|
||||||
def scrub(content):
|
def scrub(cls, content, verbose=0):
|
||||||
evil_ids = []
|
evil_ids = []
|
||||||
|
|
||||||
# parse the pdf into a pdfminer document
|
# parse the pdf into a pdfminer document
|
||||||
@ -37,7 +38,13 @@ class IEEEXplore(Plugin):
|
|||||||
#rawdata = copy(obj.rawdata)
|
#rawdata = copy(obj.rawdata)
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
if "Authorized licensed use limited to: " in data:
|
phrase= "Authorized licensed use limited to: "
|
||||||
|
if phrase in data:
|
||||||
|
if verbose >= 2:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
||||||
|
elif verbose >= 1:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))
|
||||||
|
|
||||||
evil_ids.append(objid)
|
evil_ids.append(objid)
|
||||||
|
|
||||||
for objid in evil_ids:
|
for objid in evil_ids:
|
||||||
|
@ -2,9 +2,10 @@
|
|||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from ..parser import parse_content
|
from ..parser import parse_content
|
||||||
from ..eraser import (
|
from ..eraser import (
|
||||||
remove_object_by_id,
|
|
||||||
replace_object_with,
|
replace_object_with,
|
||||||
)
|
)
|
||||||
from ..plugin import Plugin
|
from ..plugin import Plugin
|
||||||
@ -32,8 +33,8 @@ class JSTOR(Plugin):
|
|||||||
"This content downloaded on",
|
"This content downloaded on",
|
||||||
]
|
]
|
||||||
|
|
||||||
@staticmethod
|
@classmethod
|
||||||
def scrub(content):
|
def scrub(cls, content, verbose=0):
|
||||||
replacements = []
|
replacements = []
|
||||||
|
|
||||||
# jstor has certain watermarks only on the first page
|
# jstor has certain watermarks only on the first page
|
||||||
@ -54,8 +55,6 @@ class JSTOR(Plugin):
|
|||||||
|
|
||||||
if hasattr(obj, "attrs"):
|
if hasattr(obj, "attrs"):
|
||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
length = obj.attrs["Length"]
|
|
||||||
rawdata = copy(obj.rawdata)
|
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
# make sure all of the requirements are in there
|
# make sure all of the requirements are in there
|
||||||
@ -66,6 +65,9 @@ class JSTOR(Plugin):
|
|||||||
startpos = better_content.find("This content downloaded ")
|
startpos = better_content.find("This content downloaded ")
|
||||||
endpos = better_content.find(")", startpos)
|
endpos = better_content.find(")", startpos)
|
||||||
segment = better_content[startpos:endpos]
|
segment = better_content[startpos:endpos]
|
||||||
|
if verbose >= 2 and replacements:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
||||||
|
|
||||||
better_content = better_content.replace(segment, "")
|
better_content = better_content.replace(segment, "")
|
||||||
|
|
||||||
# it looks like all of the watermarks are at the end?
|
# it looks like all of the watermarks are at the end?
|
||||||
@ -82,12 +84,19 @@ class JSTOR(Plugin):
|
|||||||
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
||||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||||
endpos = better_content.find("Tf\n", startpos+5)
|
endpos = better_content.find("Tf\n", startpos+5)
|
||||||
|
|
||||||
|
if verbose >= 2 and replacements:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
||||||
|
|
||||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||||
|
|
||||||
replacements.append([objid, better_content])
|
replacements.append([objid, better_content])
|
||||||
|
|
||||||
page_id += 1
|
page_id += 1
|
||||||
|
|
||||||
|
if verbose >= 1 and replacements:
|
||||||
|
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
||||||
|
|
||||||
for deets in replacements:
|
for deets in replacements:
|
||||||
objid = deets[0]
|
objid = deets[0]
|
||||||
replacement = deets[1]
|
replacement = deets[1]
|
||||||
|
Loading…
Reference in New Issue
Block a user