mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
parent
47bc734318
commit
b7b5a4ef65
10
README.md
10
README.md
@ -28,11 +28,17 @@ file_handler.write(pdf)
|
||||
file_handler.close()
|
||||
```
|
||||
|
||||
## Supported
|
||||
|
||||
* AIP
|
||||
* IEEE
|
||||
* JSTOR
|
||||
|
||||
## Changelog
|
||||
|
||||
* 0.0.10 - JSTOR
|
||||
* 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage.
|
||||
* 0.0.8 - ieee support
|
||||
* 0.0.1 - initial commit
|
||||
* 0.0.8 - IEEE
|
||||
|
||||
## License
|
||||
|
||||
|
@ -17,8 +17,8 @@ usage:
|
||||
"""
|
||||
|
||||
__title__ = "pdfparanoia"
|
||||
__version__ = "0.0.9"
|
||||
__build__ = 0x000009
|
||||
__version__ = "0.0.10"
|
||||
__build__ = 0x000010
|
||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||
__license__ = "BSD"
|
||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
@ -9,4 +9,5 @@ Scrubbing machines. Bubbles mandatory.
|
||||
|
||||
from .aip import *
|
||||
from .ieee import *
|
||||
from .jstor import *
|
||||
|
||||
|
97
pdfparanoia/plugins/jstor.py
Normal file
97
pdfparanoia/plugins/jstor.py
Normal file
@ -0,0 +1,97 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import (
|
||||
remove_object_by_id,
|
||||
replace_object_with,
|
||||
)
|
||||
from ..plugin import Plugin
|
||||
|
||||
class JSTOR(Plugin):
    r"""
    JSTOR
    ~~~~~~~~~~~~~~~

    JSTOR watermarks a first page with an "Accessed" date, lots of TC barf, and
    then also a watermark at the bottom of each page with a timestamp.

    Watermarks removed:
        * "Accessed" timestamp on the front page
        * footer watermarks on each page

    This was primarily written for JSTOR pdfs generated by:
        /Producer (itext-paulo-155 \(itextpdf.sf.net-lowagie.com\))
    """

    # these terms appear on a page that has been watermarked
    requirements = [
        "All use subject to ",
        "JSTOR Terms and Conditions",
        "This content downloaded on",
    ]

    # Number of trailing characters dropped from every watermarked stream.
    # NOTE(review): empirical value -- "it looks like all of the watermarks
    # are at the end?" -- confirm against more samples before relying on it.
    _TRAILER_LENGTH = 160

    @staticmethod
    def scrub(content):
        """
        Return ``content`` with the JSTOR watermarks removed.

        Every FlateDecode stream object that contains all of
        ``JSTOR.requirements`` is decoded, cleaned up, and written back into
        the document with ``replace_object_with``. The first matching stream
        is treated as the front page and additionally loses its "Accessed"
        line.

        :param content: the raw pdf document
        :returns: the pdf document with the matching objects replaced
        """
        replacements = []

        # jstor has certain watermarks only on the first page
        page_id = 0

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only stream-like objects carry an attribute dictionary
            if not hasattr(obj, "attrs"):
                continue

            # "in" instead of dict.has_key(), which Python 3 no longer has
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            data = obj.get_data()

            # make sure all of the requirements are in there
            if not all(requirement in data for requirement in JSTOR.requirements):
                continue

            cleaned = JSTOR._strip_watermarks(data, page_id == 0)
            replacements.append((objid, cleaned))

            # only watermarked streams count as pages
            page_id += 1

        for objid, replacement in replacements:
            content = replace_object_with(content, objid, replacement)

        return content

    @staticmethod
    def _strip_watermarks(data, first_page):
        """
        Remove the watermark text from one decoded content stream.

        :param data: decoded content stream of a watermarked page
        :param first_page: True when the "Accessed" line should be removed too
        :returns: the cleaned content stream
        """
        # remove the date
        startpos = data.find("This content downloaded ")
        endpos = data.find(")", startpos)

        # Without a closing parenthesis find() returns -1 and the slice below
        # would run to the end of the stream, wiping out nearly all of the
        # page. Leave the stream alone in that case.
        if startpos != -1 and endpos != -1:
            segment = data[startpos:endpos]
            data = data.replace(segment, "")

        # it looks like all of the watermarks are at the end?
        data = data[:-JSTOR._TRAILER_LENGTH]

        # "Accessed on dd/mm/yyy hh:mm"
        #
        # the "Accessed" line is only on the first page
        #
        # it's based on /F2
        #
        # This would be better if it could be decoded to
        # actually search for the "Accessed" text.
        if first_page and "/F2 11 Tf\n" in data:
            startpos = data.rfind("/F2 11 Tf\n")

            # The search starts inside the marker itself, so endpos lands on
            # the marker's own "Tf\n": only the "/F2 11 " operands are cut out
            # and the operator stays behind.
            endpos = data.find("Tf\n", startpos + 5)
            data = data[0:startpos] + data[endpos:]

        return data
|
||||
|
BIN
tests/samples/jstor/231a515256115368c142f528cee7f727.pdf
Normal file
BIN
tests/samples/jstor/231a515256115368c142f528cee7f727.pdf
Normal file
Binary file not shown.
20
tests/test_jstor.py
Normal file
20
tests/test_jstor.py
Normal file
@ -0,0 +1,20 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import unittest
|
||||
import pdfparanoia
|
||||
|
||||
class JSTORTestCase(unittest.TestCase):
    """Checks the JSTOR plugin against a sample watermarked pdf."""

    def test_jstor(self):
        """Object 19 must come back as a decompressed stream after scrubbing."""
        sample_path = "tests/samples/jstor/231a515256115368c142f528cee7f727.pdf"

        # the context manager closes the file even when read() raises
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        self.assertIn("\n18 0 obj \n", content)

        # this section will later be manipulated
        self.assertIn("\n19 0 obj \n", content)

        output = pdfparanoia.plugins.JSTOR.scrub(content)

        # FlateDecode should be replaced with a decompressed section
        self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output)
|
||||
|
Loading…
Reference in New Issue
Block a user