1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-12-04 23:15:52 +01:00

Merge pull request #38 from fmap/pdfminer-api

PDFMiner made breaking interface changes
This commit is contained in:
Bryan Bishop 2013-12-06 15:27:42 -08:00
commit 5cc682e2c5
8 changed files with 50 additions and 52 deletions

View File

@ -16,6 +16,7 @@ except ImportError: # py3k
# from pdfquery import PDFQuery # from pdfquery import PDFQuery
import pdfminer.pdfparser import pdfminer.pdfparser
import pdfminer.pdfdocument
from .eraser import replace_object_with from .eraser import replace_object_with
@ -28,9 +29,7 @@ def parse_pdf(handler):
# setup for parsing # setup for parsing
parser = pdfminer.pdfparser.PDFParser(handler) parser = pdfminer.pdfparser.PDFParser(handler)
doc = pdfminer.pdfparser.PDFDocument() doc = pdfminer.pdfdocument.PDFDocument(parser)
parser.set_document(doc)
doc.set_parser(parser)
# actual parsing # actual parsing
doc.initialize() doc.initialize()
@ -58,8 +57,7 @@ def deflate(content):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# store new replacements # store new replacements

View File

@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf

View File

@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf

View File

@ -10,6 +10,8 @@ from ..eraser import (
) )
from ..plugin import Plugin from ..plugin import Plugin
from pdfminer.pdftypes import PDFObjectNotFound
class JSTOR(Plugin): class JSTOR(Plugin):
""" """
JSTOR JSTOR
@ -44,55 +46,57 @@ class JSTOR(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf
for objid in objids: for objid in objids:
# get an object by id # get an object by id
obj = pdf.getobj(objid) try:
obj = pdf.getobj(objid)
if hasattr(obj, "attrs"): if hasattr(obj, "attrs"):
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
data = copy(obj.get_data()) data = copy(obj.get_data())
# make sure all of the requirements are in there # make sure all of the requirements are in there
if all([requirement in data for requirement in JSTOR.requirements]): if all([requirement in data for requirement in JSTOR.requirements]):
better_content = data better_content = data
# remove the date
startpos = better_content.find("This content downloaded ")
endpos = better_content.find(")", startpos)
segment = better_content[startpos:endpos]
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
better_content = better_content.replace(segment, "")
# it looks like all of the watermarks are at the end?
better_content = better_content[:-160]
# "Accessed on dd/mm/yyy hh:mm"
#
# the "Accessed" line is only on the first page
#
# it's based on /F2
#
# This would be better if it could be decoded to
# actually search for the "Accessed" text.
if page_id == 0 and "/F2 11 Tf\n" in better_content:
startpos = better_content.rfind("/F2 11 Tf\n")
endpos = better_content.find("Tf\n", startpos+5)
# remove the date
startpos = better_content.find("This content downloaded ")
endpos = better_content.find(")", startpos)
segment = better_content[startpos:endpos]
if verbose >= 2 and replacements: if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
better_content = better_content[0:startpos] + better_content[endpos:] better_content = better_content.replace(segment, "")
replacements.append([objid, better_content]) # it looks like all of the watermarks are at the end?
better_content = better_content[:-160]
page_id += 1 # "Accessed on dd/mm/yyy hh:mm"
#
# the "Accessed" line is only on the first page
#
# it's based on /F2
#
# This would be better if it could be decoded to
# actually search for the "Accessed" text.
if page_id == 0 and "/F2 11 Tf\n" in better_content:
startpos = better_content.rfind("/F2 11 Tf\n")
endpos = better_content.find("Tf\n", startpos+5)
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
better_content = better_content[0:startpos] + better_content[endpos:]
replacements.append([objid, better_content])
page_id += 1
except PDFObjectNotFound, e:
print >>sys.stderr, 'Missing object: %r' % e
if verbose >= 1 and replacements: if verbose >= 1 and replacements:
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements)) sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))

View File

@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf

View File

@ -27,8 +27,7 @@ class ScienceMagazine(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf

View File

@ -1 +1 @@
pdfminer>=0 pdfminer>=20131113

View File

@ -11,7 +11,7 @@ long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).re
if platform.python_version() >= "3.0.0": if platform.python_version() >= "3.0.0":
dependencies = ["pdfminer3k>=1.3.0"] dependencies = ["pdfminer3k>=1.3.0"]
else: else:
dependencies = ["pdfminer>=0"] dependencies = ["pdfminer>=20131113"]
packages = [ packages = [
"pdfparanoia", "pdfparanoia",