mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
Merge pull request #38 from fmap/pdfminer-api
PDFMiner made breaking interface changes
This commit is contained in:
commit
5cc682e2c5
@ -16,6 +16,7 @@ except ImportError: # py3k
|
|||||||
# from pdfquery import PDFQuery
|
# from pdfquery import PDFQuery
|
||||||
|
|
||||||
import pdfminer.pdfparser
|
import pdfminer.pdfparser
|
||||||
|
import pdfminer.pdfdocument
|
||||||
|
|
||||||
from .eraser import replace_object_with
|
from .eraser import replace_object_with
|
||||||
|
|
||||||
@ -28,9 +29,7 @@ def parse_pdf(handler):
|
|||||||
|
|
||||||
# setup for parsing
|
# setup for parsing
|
||||||
parser = pdfminer.pdfparser.PDFParser(handler)
|
parser = pdfminer.pdfparser.PDFParser(handler)
|
||||||
doc = pdfminer.pdfparser.PDFDocument()
|
doc = pdfminer.pdfdocument.PDFDocument(parser)
|
||||||
parser.set_document(doc)
|
|
||||||
doc.set_parser(parser)
|
|
||||||
|
|
||||||
# actual parsing
|
# actual parsing
|
||||||
doc.initialize()
|
doc.initialize()
|
||||||
@ -58,8 +57,7 @@ def deflate(content):
|
|||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# store new replacements
|
# store new replacements
|
||||||
|
@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
|
|||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
@ -10,6 +10,8 @@ from ..eraser import (
|
|||||||
)
|
)
|
||||||
from ..plugin import Plugin
|
from ..plugin import Plugin
|
||||||
|
|
||||||
|
from pdfminer.pdftypes import PDFObjectNotFound
|
||||||
|
|
||||||
class JSTOR(Plugin):
|
class JSTOR(Plugin):
|
||||||
"""
|
"""
|
||||||
JSTOR
|
JSTOR
|
||||||
@ -44,55 +46,57 @@ class JSTOR(Plugin):
|
|||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
# get an object by id
|
# get an object by id
|
||||||
obj = pdf.getobj(objid)
|
try:
|
||||||
|
obj = pdf.getobj(objid)
|
||||||
|
|
||||||
if hasattr(obj, "attrs"):
|
if hasattr(obj, "attrs"):
|
||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
# make sure all of the requirements are in there
|
# make sure all of the requirements are in there
|
||||||
if all([requirement in data for requirement in JSTOR.requirements]):
|
if all([requirement in data for requirement in JSTOR.requirements]):
|
||||||
better_content = data
|
better_content = data
|
||||||
|
|
||||||
# remove the date
|
|
||||||
startpos = better_content.find("This content downloaded ")
|
|
||||||
endpos = better_content.find(")", startpos)
|
|
||||||
segment = better_content[startpos:endpos]
|
|
||||||
if verbose >= 2 and replacements:
|
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
|
||||||
|
|
||||||
better_content = better_content.replace(segment, "")
|
|
||||||
|
|
||||||
# it looks like all of the watermarks are at the end?
|
|
||||||
better_content = better_content[:-160]
|
|
||||||
|
|
||||||
# "Accessed on dd/mm/yyy hh:mm"
|
|
||||||
#
|
|
||||||
# the "Accessed" line is only on the first page
|
|
||||||
#
|
|
||||||
# it's based on /F2
|
|
||||||
#
|
|
||||||
# This would be better if it could be decoded to
|
|
||||||
# actually search for the "Accessed" text.
|
|
||||||
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
|
||||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
|
||||||
endpos = better_content.find("Tf\n", startpos+5)
|
|
||||||
|
|
||||||
|
# remove the date
|
||||||
|
startpos = better_content.find("This content downloaded ")
|
||||||
|
endpos = better_content.find(")", startpos)
|
||||||
|
segment = better_content[startpos:endpos]
|
||||||
if verbose >= 2 and replacements:
|
if verbose >= 2 and replacements:
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
||||||
|
|
||||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
better_content = better_content.replace(segment, "")
|
||||||
|
|
||||||
replacements.append([objid, better_content])
|
# it looks like all of the watermarks are at the end?
|
||||||
|
better_content = better_content[:-160]
|
||||||
|
|
||||||
page_id += 1
|
# "Accessed on dd/mm/yyy hh:mm"
|
||||||
|
#
|
||||||
|
# the "Accessed" line is only on the first page
|
||||||
|
#
|
||||||
|
# it's based on /F2
|
||||||
|
#
|
||||||
|
# This would be better if it could be decoded to
|
||||||
|
# actually search for the "Accessed" text.
|
||||||
|
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
||||||
|
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||||
|
endpos = better_content.find("Tf\n", startpos+5)
|
||||||
|
|
||||||
|
if verbose >= 2 and replacements:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
||||||
|
|
||||||
|
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||||
|
|
||||||
|
replacements.append([objid, better_content])
|
||||||
|
|
||||||
|
page_id += 1
|
||||||
|
except PDFObjectNotFound, e:
|
||||||
|
print >>sys.stderr, 'Missing object: %r' % e
|
||||||
|
|
||||||
if verbose >= 1 and replacements:
|
if verbose >= 1 and replacements:
|
||||||
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
||||||
|
@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
|
|||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
@ -27,8 +27,7 @@ class ScienceMagazine(Plugin):
|
|||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
@ -1 +1 @@
|
|||||||
pdfminer>=0
|
pdfminer>=20131113
|
||||||
|
2
setup.py
2
setup.py
@ -11,7 +11,7 @@ long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).re
|
|||||||
if platform.python_version() >= "3.0.0":
|
if platform.python_version() >= "3.0.0":
|
||||||
dependencies = ["pdfminer3k>=1.3.0"]
|
dependencies = ["pdfminer3k>=1.3.0"]
|
||||||
else:
|
else:
|
||||||
dependencies = ["pdfminer>=0"]
|
dependencies = ["pdfminer>=20131113"]
|
||||||
|
|
||||||
packages = [
|
packages = [
|
||||||
"pdfparanoia",
|
"pdfparanoia",
|
||||||
|
Loading…
Reference in New Issue
Block a user