111 lines
4.0 KiB
Python
111 lines
4.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from copy import copy
|
|
|
|
import sys
|
|
|
|
from ..parser import parse_content
|
|
from ..eraser import (
|
|
replace_object_with,
|
|
)
|
|
from ..plugin import Plugin
|
|
|
|
from pdfminer.pdftypes import PDFObjectNotFound
|
|
|
|
class JSTOR(Plugin):
    r"""
    JSTOR
    ~~~~~~~~~~~~~~~

    JSTOR watermarks a first page with an "Accessed" date, lots of TC barf, and
    then also a watermark at the bottom of each page with a timestamp.

    Watermarks removed:
        * "Accessed" timestamp on the front page
        * footer watermarks on each page

    This was primarily written for JSTOR pdfs generated by:
        /Producer (itext-paulo-155 \(itextpdf.sf.net-lowagie.com\))
    """

    # these terms appear on a page that has been watermarked
    requirements = [
        "All use subject to ",
        "JSTOR Terms and Conditions",
        "This content downloaded on",
    ]

    # Number of trailing characters dropped from every watermarked stream.
    # NOTE(review): empirical value; the original author's comment was
    # "it looks like all of the watermarks are at the end?" -- confirm
    # against PDFs from other producers before relying on it.
    footer_length = 160

    # Font-selection operator that precedes the "Accessed" line.
    # This would be better if the stream could be decoded to actually
    # search for the "Accessed" text.
    accessed_marker = "/F2 11 Tf\n"

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Remove JSTOR watermarks from a PDF.

        Every object listed in the first xref section is inspected. An object
        is treated as a watermarked page stream when it has an ``attrs``
        mapping whose ``Filter`` is ``/FlateDecode`` and its decoded data
        contains every string in ``cls.requirements``. Each such stream is
        rewritten by ``_strip_watermarks`` and substituted back into
        ``content`` via ``replace_object_with``.

        :param content: the PDF document, in whatever form ``parse_content``
            and ``replace_object_with`` accept (presumably the raw file
            contents -- verify against the caller).
        :param verbose: 0 is silent; >= 1 writes a summary of the replaced
            object ids to stderr; >= 2 also reports each removed segment.
        :returns: ``content`` after all replacements were applied (unchanged
            when no watermarked stream was found).
        """
        replacements = []

        # jstor has certain watermarks only on the first page, so count the
        # watermarked streams seen so far
        page_id = 0

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        xref = pdf.xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            try:
                obj = pdf.getobj(objid)

                if not hasattr(obj, "attrs"):
                    continue

                attrs = obj.attrs
                # "in" instead of dict.has_key(), which Python 3 removed
                if "Filter" not in attrs or str(attrs["Filter"]) != "/FlateDecode":
                    continue

                # NOTE(review): the string searches below assume get_data()
                # returns the same string type as the literals used here
                # (true for Python 2 str) -- confirm before running on
                # Python 3, where pdfminer may return bytes.
                data = copy(obj.get_data())

                # make sure all of the requirements are in there
                if not all(requirement in data for requirement in cls.requirements):
                    continue

                better_content = cls._strip_watermarks(
                    data, objid, first_page=(page_id == 0), verbose=verbose)
                replacements.append([objid, better_content])

                # only watermarked streams advance the page counter
                page_id += 1
            except PDFObjectNotFound as e:
                sys.stderr.write('Missing object: %r\n' % (e,))

        if verbose >= 1 and replacements:
            sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))

        for objid, replacement in replacements:
            content = replace_object_with(content, objid, replacement)

        return content

    @classmethod
    def _strip_watermarks(cls, data, objid, first_page, verbose=0):
        """
        Return ``data`` with the JSTOR watermark text removed.

        :param data: decoded stream data that already passed the
            ``requirements`` check.
        :param objid: id of the object ``data`` came from (used only in the
            diagnostic messages).
        :param first_page: True when this is the first watermarked stream
            encountered; only then is the "Accessed" line handled.
        :param verbose: >= 2 writes each removed segment to stderr.
        :returns: the rewritten stream data.
        """
        better_content = data

        # remove the date: everything from "This content downloaded " up to
        # (not including) the next ")"
        startpos = better_content.find("This content downloaded ")
        endpos = better_content.find(")", startpos)
        # Without a closing ")" find() returns -1 and the slice would run to
        # the second-to-last character, wiping nearly the whole stream, so
        # the date is only removed when both ends were located.
        if startpos != -1 and endpos != -1:
            segment = better_content[startpos:endpos]
            if verbose >= 2:
                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))

            # replace() removes every occurrence of the segment
            better_content = better_content.replace(segment, "")

        # it looks like all of the watermarks are at the end?
        better_content = better_content[:-cls.footer_length]

        # "Accessed on dd/mm/yyyy hh:mm"
        #
        # the "Accessed" line is only on the first page
        #
        # it's based on /F2
        if first_page and cls.accessed_marker in better_content:
            startpos = better_content.rfind(cls.accessed_marker)
            # NOTE(review): the search starts 5 characters into the marker,
            # so it matches the marker's own "Tf\n" (offset 7) and the cut
            # below removes just the "/F2 11 " operands. Kept as-is to
            # preserve existing output -- confirm whether the intent was to
            # cut through to the *next* "Tf\n".
            endpos = better_content.find("Tf\n", startpos + 5)

            if verbose >= 2:
                sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))

            better_content = better_content[0:startpos] + better_content[endpos:]

        return better_content
|
|
|