From e95374ec044c84b154b2595591732648ebb2b1b8 Mon Sep 17 00:00:00 2001 From: vi Date: Sat, 7 Dec 2013 07:20:13 +0800 Subject: [PATCH] getobj can raise PDFObjectNotFound --- pdfparanoia/plugins/jstor.py | 75 +++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py index abb2d28..5821751 100644 --- a/pdfparanoia/plugins/jstor.py +++ b/pdfparanoia/plugins/jstor.py @@ -10,6 +10,8 @@ from ..eraser import ( ) from ..plugin import Plugin +from pdfminer.pdftypes import PDFObjectNotFound + class JSTOR(Plugin): """ JSTOR @@ -50,48 +52,51 @@ class JSTOR(Plugin): # check each object in the pdf for objid in objids: # get an object by id - obj = pdf.getobj(objid) + try: + obj = pdf.getobj(objid) - if hasattr(obj, "attrs"): - if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": - data = copy(obj.get_data()) + if hasattr(obj, "attrs"): + if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": + data = copy(obj.get_data()) - # make sure all of the requirements are in there - if all([requirement in data for requirement in JSTOR.requirements]): - better_content = data - - # remove the date - startpos = better_content.find("This content downloaded ") - endpos = better_content.find(")", startpos) - segment = better_content[startpos:endpos] - if verbose >= 2 and replacements: - sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) - - better_content = better_content.replace(segment, "") - - # it looks like all of the watermarks are at the end? - better_content = better_content[:-160] - - # "Accessed on dd/mm/yyy hh:mm" - # - # the "Accessed" line is only on the first page - # - # it's based on /F2 - # - # This would be better if it could be decoded to - # actually search for the "Accessed" text. - if page_id == 0 and "/F2 11 Tf\n" in better_content: - startpos = better_content.rfind("/F2 11 Tf\n") - endpos = better_content.find("Tf\n", startpos+5) + # make sure all of the requirements are in there + if all([requirement in data for requirement in JSTOR.requirements]): + better_content = data + # remove the date + startpos = better_content.find("This content downloaded ") + endpos = better_content.find(")", startpos) + segment = better_content[startpos:endpos] if verbose >= 2 and replacements: - sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment)) - better_content = better_content[0:startpos] + better_content[endpos:] + better_content = better_content.replace(segment, "") - replacements.append([objid, better_content]) + # it looks like all of the watermarks are at the end? + better_content = better_content[:-160] - page_id += 1 + # "Accessed on dd/mm/yyy hh:mm" + # + # the "Accessed" line is only on the first page + # + # it's based on /F2 + # + # This would be better if it could be decoded to + # actually search for the "Accessed" text. + if page_id == 0 and "/F2 11 Tf\n" in better_content: + startpos = better_content.rfind("/F2 11 Tf\n") + endpos = better_content.find("Tf\n", startpos+5) + + if verbose >= 2 and replacements: + sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) + + better_content = better_content[0:startpos] + better_content[endpos:] + + replacements.append([objid, better_content]) + + page_id += 1 + except PDFObjectNotFound, e: + print >>sys.stderr, 'Missing object: %r' % e if verbose >= 1 and replacements: sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))