Merge pull request #38 from fmap/pdfminer-api

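PDFMiner made breaking interface changes: PDFDocument now lives in pdfminer.pdfdocument and takes the parser as a constructor argument, and cross-reference tables are read from doc.xrefs instead of parser.read_xref(). Requires pdfminer>=20131113.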
2025-07-15 14:08:21 +02:00 · 2013-12-06 15:27:42 -08:00 · 2013-12-06 15:27:42 -08:00 · 5cc682e2c5
commit 5cc682e2c5
parent 713776af67 e95374ec04
8 changed files with 50 additions and 52 deletions
--- a/pdfparanoia/parser.py
+++ b/pdfparanoia/parser.py
@ -16,6 +16,7 @@ except ImportError: # py3k
 # from pdfquery import PDFQuery

 import pdfminer.pdfparser
+import pdfminer.pdfdocument

 from .eraser import replace_object_with

@ -28,9 +29,7 @@ def parse_pdf(handler):

    # setup for parsing
    parser = pdfminer.pdfparser.PDFParser(handler)
-    doc = pdfminer.pdfparser.PDFDocument()
-    parser.set_document(doc)
-    doc.set_parser(parser)
+    doc = pdfminer.pdfdocument.PDFDocument(parser)

    # actual parsing
    doc.initialize()
@ -58,8 +57,7 @@ def deflate(content):
    pdf = parse_content(content)

    # get a list of all object ids
-    xrefs = pdf._parser.read_xref()
-    xref = xrefs[0]
+    xref = pdf.xrefs[0]
    objids = xref.get_objids()

    # store new replacements
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
        pdf = parse_content(content)

        # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
--- a/pdfparanoia/plugins/ieee.py
+++ b/pdfparanoia/plugins/ieee.py
@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
        pdf = parse_content(content)

        # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
--- a/pdfparanoia/plugins/jstor.py
+++ b/pdfparanoia/plugins/jstor.py
@ -10,6 +10,8 @@ from ..eraser import (
 )
 from ..plugin import Plugin

+from pdfminer.pdftypes import PDFObjectNotFound
+
 class JSTOR(Plugin):
    """
    JSTOR
@ -44,55 +46,57 @@ class JSTOR(Plugin):
        pdf = parse_content(content)

        # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
-            obj = pdf.getobj(objid)
+            try:
+                obj = pdf.getobj(objid)

-            if hasattr(obj, "attrs"):
-                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
-                    data = copy(obj.get_data())
+                if hasattr(obj, "attrs"):
+                    if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
+                        data = copy(obj.get_data())

-                    # make sure all of the requirements are in there
-                    if all([requirement in data for requirement in JSTOR.requirements]):
-                        better_content = data
-
-                        # remove the date
-                        startpos = better_content.find("This content downloaded ")
-                        endpos = better_content.find(")", startpos)
-                        segment = better_content[startpos:endpos]
-                        if verbose >= 2 and replacements:
-                            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
-
-                        better_content = better_content.replace(segment, "")
-
-                        # it looks like all of the watermarks are at the end?
-                        better_content = better_content[:-160]
-
-                        # "Accessed on dd/mm/yyy hh:mm"
-                        #
-                        # the "Accessed" line is only on the first page
-                        #
-                        # it's based on /F2
-                        #
-                        # This would be better if it could be decoded to
-                        # actually search for the "Accessed" text.
-                        if page_id == 0 and "/F2 11 Tf\n" in better_content:
-                            startpos = better_content.rfind("/F2 11 Tf\n")
-                            endpos = better_content.find("Tf\n", startpos+5)
+                        # make sure all of the requirements are in there
+                        if all([requirement in data for requirement in JSTOR.requirements]):
+                            better_content = data

+                            # remove the date
+                            startpos = better_content.find("This content downloaded ")
+                            endpos = better_content.find(")", startpos)
+                            segment = better_content[startpos:endpos]
                            if verbose >= 2 and replacements:
-                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+                                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))

-                            better_content = better_content[0:startpos] + better_content[endpos:]
+                            better_content = better_content.replace(segment, "")

-                        replacements.append([objid, better_content])
+                            # it looks like all of the watermarks are at the end?
+                            better_content = better_content[:-160]

-                        page_id += 1
+                            # "Accessed on dd/mm/yyyy hh:mm"
+                            #
+                            # the "Accessed" line is only on the first page
+                            #
+                            # it's based on /F2
+                            #
+                            # This would be better if it could be decoded to
+                            # actually search for the "Accessed" text.
+                            if page_id == 0 and "/F2 11 Tf\n" in better_content:
+                                startpos = better_content.rfind("/F2 11 Tf\n")
+                                endpos = better_content.find("Tf\n", startpos+5)
+
+                                if verbose >= 2 and replacements:
+                                    sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
+
+                                better_content = better_content[0:startpos] + better_content[endpos:]
+
+                            replacements.append([objid, better_content])
+
+                            page_id += 1
+            except PDFObjectNotFound, e:
+                print >>sys.stderr, 'Missing object: %r' % e

        if verbose >= 1 and replacements:
            sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
--- a/pdfparanoia/plugins/rsc.py
+++ b/pdfparanoia/plugins/rsc.py
@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
            pdf = parse_content(content)

            # get a list of all object ids
-            xrefs = pdf._parser.read_xref()
-            xref = xrefs[0]
+            xref = pdf.xrefs[0]
            objids = xref.get_objids()

            # check each object in the pdf
--- a/pdfparanoia/plugins/sciencemagazine.py
+++ b/pdfparanoia/plugins/sciencemagazine.py
@ -27,8 +27,7 @@ class ScienceMagazine(Plugin):
        pdf = parse_content(content)

        # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+        xref = pdf.xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1 @@
-pdfminer>=0
+pdfminer>=20131113
--- a/setup.py
+++ b/setup.py
@ -11,7 +11,7 @@ long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).re
 if platform.python_version() >= "3.0.0":
    dependencies = ["pdfminer3k>=1.3.0"]
 else:
-    dependencies = ["pdfminer>=0"]
+    dependencies = ["pdfminer>=20131113"]

 packages = [
    "pdfparanoia",