From 380bc289b33eeaf175dd9cce524161ef289c28fc Mon Sep 17 00:00:00 2001 From: vi Date: Sat, 7 Dec 2013 07:12:19 +0800 Subject: [PATCH 1/3] Adapt to PDFMiner's breaking interface changes (#37). --- pdfparanoia/parser.py | 8 +++----- pdfparanoia/plugins/aip.py | 3 +-- pdfparanoia/plugins/ieee.py | 3 +-- pdfparanoia/plugins/jstor.py | 3 +-- pdfparanoia/plugins/rsc.py | 3 +-- pdfparanoia/plugins/sciencemagazine.py | 3 +-- 6 files changed, 8 insertions(+), 15 deletions(-) diff --git a/pdfparanoia/parser.py b/pdfparanoia/parser.py index 846008b..5fe2420 100644 --- a/pdfparanoia/parser.py +++ b/pdfparanoia/parser.py @@ -16,6 +16,7 @@ except ImportError: # py3k # from pdfquery import PDFQuery import pdfminer.pdfparser +import pdfminer.pdfdocument from .eraser import replace_object_with @@ -28,9 +29,7 @@ def parse_pdf(handler): # setup for parsing parser = pdfminer.pdfparser.PDFParser(handler) - doc = pdfminer.pdfparser.PDFDocument() - parser.set_document(doc) - doc.set_parser(parser) + doc = pdfminer.pdfdocument.PDFDocument(parser) # actual parsing doc.initialize() @@ -58,8 +57,7 @@ def deflate(content): pdf = parse_content(content) # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] + xref = pdf.xrefs[0] objids = xref.get_objids() # store new replacements diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py index 9e93e71..e824b38 100644 --- a/pdfparanoia/plugins/aip.py +++ b/pdfparanoia/plugins/aip.py @@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin): pdf = parse_content(content) # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] + xref = pdf.xrefs[0] objids = xref.get_objids() # check each object in the pdf diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py index 4acb9d6..f8c22bf 100644 --- a/pdfparanoia/plugins/ieee.py +++ b/pdfparanoia/plugins/ieee.py @@ -22,8 +22,7 @@ class IEEEXplore(Plugin): pdf = parse_content(content) # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] + xref = pdf.xrefs[0] objids = xref.get_objids() # check each object in the pdf diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index 0ca971d..abb2d28 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -44,8 +44,7 @@ class JSTOR(Plugin): pdf = parse_content(content) # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] + xref = pdf.xrefs[0] objids = xref.get_objids() # check each object in the pdf diff --git a/pdfparanoia/plugins/rsc.py b/pdfparanoia/plugins/rsc.py index 524fc32..3125ae5 100644 --- a/pdfparanoia/plugins/rsc.py +++ b/pdfparanoia/plugins/rsc.py @@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin): pdf = parse_content(content) # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] + xref = pdf.xrefs[0] objids = xref.get_objids() # check each object in the pdf diff --git a/pdfparanoia/plugins/sciencemagazine.py b/pdfparanoia/plugins/sciencemagazine.py index 32c25eb..b4e5c5c 100644 --- a/pdfparanoia/plugins/sciencemagazine.py +++ b/pdfparanoia/plugins/sciencemagazine.py @@ -27,8 +27,7 @@ class ScienceMagazine(Plugin): pdf = parse_content(content) # get a list of all object ids - xrefs = pdf._parser.read_xref() - xref = xrefs[0] + xref = pdf.xrefs[0] objids = xref.get_objids() # check each object in the pdf From 95a249d8ab955226c58e82608af8c155b74295b6 Mon Sep 17 00:00:00 2001 From: vi Date: Sat, 7 Dec 2013 07:14:41 +0800 Subject: [PATCH 2/3] Package: use a version of PDFMiner since the interface change (#37). --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7eaca79..4e16419 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pdfminer>=0 +pdfminer>=20131113 diff --git a/setup.py b/setup.py index 1125ab2..fee847a 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).re if platform.python_version() >= "3.0.0": dependencies = ["pdfminer3k>=1.3.0"] else: - dependencies = ["pdfminer>=0"] + dependencies = ["pdfminer>=20131113"] packages = [ "pdfparanoia", From e95374ec044c84b154b2595591732648ebb2b1b8 Mon Sep 17 00:00:00 2001 From: vi Date: Sat, 7 Dec 2013 07:20:13 +0800 Subject: [PATCH 3/3] getobj can raise PDFObjectNotFound --- pdfparanoia/plugins/jstor.py | 75 +++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index abb2d28..5821751 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -10,6 +10,8 @@ from ..eraser import ( ) from ..plugin import Plugin +from pdfminer.pdftypes import PDFObjectNotFound + class JSTOR(Plugin): """ JSTOR @@ -50,48 +52,51 @@ class JSTOR(Plugin): # check each object in the pdf for objid in objids: # get an object by id - obj = pdf.getobj(objid) + try: + obj = pdf.getobj(objid) - if hasattr(obj, "attrs"): - if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": - data = copy(obj.get_data()) + if hasattr(obj, "attrs"): + if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": + data = copy(obj.get_data()) - # make sure all of the requirements are in there - if all([requirement in data for requirement in JSTOR.requirements]): - better_content = data - - # remove the date - startpos = better_content.find("This content downloaded ") - endpos = better_content.find(")", startpos) - segment = better_content[startpos:endpos] - if verbose >= 2 and replacements: - sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) - - better_content = better_content.replace(segment, "") - - # it looks like all of the watermarks are at the end? - better_content = better_content[:-160] - - # "Accessed on dd/mm/yyy hh:mm" - # - # the "Accessed" line is only on the first page - # - # it's based on /F2 - # - # This would be better if it could be decoded to - # actually search for the "Accessed" text. - if page_id == 0 and "/F2 11 Tf\n" in better_content: - startpos = better_content.rfind("/F2 11 Tf\n") - endpos = better_content.find("Tf\n", startpos+5) + # make sure all of the requirements are in there + if all([requirement in data for requirement in JSTOR.requirements]): + better_content = data + # remove the date + startpos = better_content.find("This content downloaded ") + endpos = better_content.find(")", startpos) + segment = better_content[startpos:endpos] if verbose >= 2 and replacements: - sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) - better_content = better_content[0:startpos] + better_content[endpos:] + better_content = better_content.replace(segment, "") - replacements.append([objid, better_content]) + # it looks like all of the watermarks are at the end? + better_content = better_content[:-160] - page_id += 1 + # "Accessed on dd/mm/yyy hh:mm" + # + # the "Accessed" line is only on the first page + # + # it's based on /F2 + # + # This would be better if it could be decoded to + # actually search for the "Accessed" text. + if page_id == 0 and "/F2 11 Tf\n" in better_content: + startpos = better_content.rfind("/F2 11 Tf\n") + endpos = better_content.find("Tf\n", startpos+5) + + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + + better_content = better_content[0:startpos] + better_content[endpos:] + + replacements.append([objid, better_content]) + + page_id += 1 + except PDFObjectNotFound, e: + print >>sys.stderr, 'Missing object: %r' % e if verbose >= 1 and replacements: sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))