From 380bc289b33eeaf175dd9cce524161ef289c28fc Mon Sep 17 00:00:00 2001
From: vi <noreply@example.org>
Date: Sat, 7 Dec 2013 07:12:19 +0800
Subject: [PATCH 1/3] Adapt to PDFMiner's breaking interface changes (#37).

---
 pdfparanoia/parser.py                  | 8 +++-----
 pdfparanoia/plugins/aip.py             | 3 +--
 pdfparanoia/plugins/ieee.py            | 3 +--
 pdfparanoia/plugins/jstor.py           | 3 +--
 pdfparanoia/plugins/rsc.py             | 3 +--
 pdfparanoia/plugins/sciencemagazine.py | 3 +--
 6 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/pdfparanoia/parser.py b/pdfparanoia/parser.py
index 846008b..5fe2420 100644
--- a/pdfparanoia/parser.py
+++ b/pdfparanoia/parser.py
@@ -16,6 +16,7 @@ except ImportError: # py3k
 # from pdfquery import PDFQuery
 
 import pdfminer.pdfparser
+import pdfminer.pdfdocument
 
 from .eraser import replace_object_with
 
@@ -28,9 +29,7 @@ def parse_pdf(handler):
 
     # setup for parsing
     parser = pdfminer.pdfparser.PDFParser(handler)
-    doc = pdfminer.pdfparser.PDFDocument()
-    parser.set_document(doc)
-    doc.set_parser(parser)
+    doc = pdfminer.pdfdocument.PDFDocument(parser)
 
     # actual parsing
     doc.initialize()
@@ -58,8 +57,7 @@ def deflate(content):
     pdf = parse_content(content)
 
     # get a list of all object ids
-    xrefs = pdf._parser.read_xref()
-    xref = xrefs[0]
+    xref = pdf.xrefs[0]
     objids = xref.get_objids()
 
     # store new replacements
diff --git a/pdfparanoia/plugins/aip.py b/pdfparanoia/plugins/aip.py
index 9e93e71..e824b38 100644
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
         pdf = parse_content(content)
 
         # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
         objids = xref.get_objids()
 
         # check each object in the pdf
diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py
index 4acb9d6..f8c22bf 100644
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
         pdf = parse_content(content)
 
         # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
         objids = xref.get_objids()
 
         # check each object in the pdf
diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py
index 0ca971d..abb2d28 100644
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -44,8 +44,7 @@ class JSTOR(Plugin):
         pdf = parse_content(content)
 
         # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
         objids = xref.get_objids()
 
         # check each object in the pdf
diff --git a/pdfparanoia/plugins/rsc.py b/pdfparanoia/plugins/rsc.py
index 524fc32..3125ae5 100644
--- a/pdfparanoia/plugins/rsc.py
+++ b/pdfparanoia/plugins/rsc.py
@@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
             pdf = parse_content(content)
 
             # get a list of all object ids
-            xrefs = pdf._parser.read_xref()
-            xref = xrefs[0]
+            xref = pdf.xrefs[0]
             objids = xref.get_objids()
 
             # check each object in the pdf
diff --git a/pdfparanoia/plugins/sciencemagazine.py b/pdfparanoia/plugins/sciencemagazine.py
index 32c25eb..b4e5c5c 100644
--- a/pdfparanoia/plugins/sciencemagazine.py
+++ b/pdfparanoia/plugins/sciencemagazine.py
@@ -27,8 +27,7 @@ class ScienceMagazine(Plugin):
         pdf = parse_content(content)
 
         # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
         objids = xref.get_objids()
 
         # check each object in the pdf

From 95a249d8ab955226c58e82608af8c155b74295b6 Mon Sep 17 00:00:00 2001
From: vi <noreply@example.org>
Date: Sat, 7 Dec 2013 07:14:41 +0800
Subject: [PATCH 2/3] Package: require a PDFMiner release that includes the
 interface change (#37).

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 7eaca79..4e16419 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-pdfminer>=0
+pdfminer>=20131113
diff --git a/setup.py b/setup.py
index 1125ab2..fee847a 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@ long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).re
 if platform.python_version() >= "3.0.0":
     dependencies = ["pdfminer3k>=1.3.0"]
 else:
-    dependencies = ["pdfminer>=0"]
+    dependencies = ["pdfminer>=20131113"]
 
 packages = [
     "pdfparanoia",

From e95374ec044c84b154b2595591732648ebb2b1b8 Mon Sep 17 00:00:00 2001
From: vi <noreply@example.org>
Date: Sat, 7 Dec 2013 07:20:13 +0800
Subject: [PATCH 3/3] getobj can raise PDFObjectNotFound

---
 pdfparanoia/plugins/jstor.py | 75 +++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 35 deletions(-)

diff --git a/pdfparanoia/plugins/jstor.py b/pdfparanoia/plugins/jstor.py
index abb2d28..5821751 100644
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@@ -10,6 +10,8 @@ from ..eraser import (
 )
 from ..plugin import Plugin
 
+from pdfminer.pdftypes import PDFObjectNotFound
+
 class JSTOR(Plugin):
     """
     JSTOR
@@ -50,48 +52,51 @@ class JSTOR(Plugin):
         # check each object in the pdf
         for objid in objids:
             # get an object by id
-            obj = pdf.getobj(objid)
+            try:
+                obj = pdf.getobj(objid)
 
-            if hasattr(obj, "attrs"):
-                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
-                    data = copy(obj.get_data())
+                if hasattr(obj, "attrs"):
+                    if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
+                        data = copy(obj.get_data())
 
-                    # make sure all of the requirements are in there
-                    if all([requirement in data for requirement in JSTOR.requirements]):
-                        better_content = data
-
-                        # remove the date
-                        startpos = better_content.find("This content downloaded ")
-                        endpos = better_content.find(")", startpos)
-                        segment = better_content[startpos:endpos]
-                        if verbose >= 2 and replacements:
-                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
-
-                        better_content = better_content.replace(segment, "")
-
-                        # it looks like all of the watermarks are at the end?
-                        better_content = better_content[:-160]
-
-                        # "Accessed on dd/mm/yyy hh:mm"
-                        #
-                        # the "Accessed" line is only on the first page
-                        #
-                        # it's based on /F2
-                        #
-                        # This would be better if it could be decoded to
-                        # actually search for the "Accessed" text.
-                        if page_id == 0 and "/F2 11 Tf\n" in better_content:
-                            startpos = better_content.rfind("/F2 11 Tf\n")
-                            endpos = better_content.find("Tf\n", startpos+5)
+                        # make sure all of the requirements are in there
+                        if all([requirement in data for requirement in JSTOR.requirements]):
+                            better_content = data
 
+                            # remove the date
+                            startpos = better_content.find("This content downloaded ")
+                            endpos = better_content.find(")", startpos)
+                            segment = better_content[startpos:endpos]
                             if verbose >= 2 and replacements:
-                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
 
-                            better_content = better_content[0:startpos] + better_content[endpos:]
+                            better_content = better_content.replace(segment, "")
 
-                        replacements.append([objid, better_content])
+                            # it looks like all of the watermarks are at the end?
+                            better_content = better_content[:-160]
 
-                        page_id += 1
+                            # "Accessed on dd/mm/yyyy hh:mm"
+                            #
+                            # the "Accessed" line is only on the first page
+                            #
+                            # it's based on /F2
+                            #
+                            # This would be better if it could be decoded to
+                            # actually search for the "Accessed" text.
+                            if page_id == 0 and "/F2 11 Tf\n" in better_content:
+                                startpos = better_content.rfind("/F2 11 Tf\n")
+                                endpos = better_content.find("Tf\n", startpos+5)
+
+                                if verbose >= 2 and replacements:
+                                    sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+
+                                better_content = better_content[0:startpos] + better_content[endpos:]
+
+                            replacements.append([objid, better_content])
+
+                            page_id += 1
+            except PDFObjectNotFound, e:
+                print >>sys.stderr, 'Missing object: %r' % e
 
         if verbose >= 1 and replacements:
             sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))