jstor watermark removal

fixes #1
This commit is contained in:
Bryan Bishop 2013-02-06 17:31:19 -06:00
parent 47bc734318
commit b7b5a4ef65
6 changed files with 128 additions and 4 deletions

View File

@ -28,11 +28,17 @@ file_handler.write(pdf)
file_handler.close()
```
## Supported
* AIP
* IEEE
* JSTOR
## Changelog
* 0.0.10 - JSTOR
* 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage.
* 0.0.8 - ieee support
* 0.0.1 - initial commit
* 0.0.8 - IEEE
## License

View File

@ -17,8 +17,8 @@ usage:
"""
__title__ = "pdfparanoia"
__version__ = "0.0.9"
__build__ = 0x000009
__version__ = "0.0.10"
__build__ = 0x000010
__author__ = "Bryan Bishop <kanzure@gmail.com>"
__license__ = "BSD"
__copyright__ = "Copyright 2013 Bryan Bishop"

View File

@ -9,4 +9,5 @@ Scrubbing machines. Bubbles mandatory.
from .aip import *
from .ieee import *
from .jstor import *

View File

@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
from copy import copy
from ..parser import parse_content
from ..eraser import (
remove_object_by_id,
replace_object_with,
)
from ..plugin import Plugin
class JSTOR(Plugin):
    """
    JSTOR
    ~~~~~~~~~~~~~~~

    JSTOR watermarks a first page with an "Accessed" date, lots of TC barf, and
    then also a watermark at the bottom of each page with a timestamp.

    Watermarks removed:
        * "Accessed" timestamp on the front page
        * footer watermarks on each page

    This was primarily written for JSTOR pdfs generated by:
        /Producer (itext-paulo-155 \(itextpdf.sf.net-lowagie.com\))
    """

    # these terms appear on a page that has been watermarked
    requirements = [
        "All use subject to ",
        "JSTOR Terms and Conditions",
        "This content downloaded on",
    ]

    @staticmethod
    def scrub(content):
        """
        Strip JSTOR watermarks out of a pdf.

        Every object with a /FlateDecode filter whose data contains all of
        ``JSTOR.requirements`` is rewritten: the "This content downloaded"
        notice is cut out, the trailing 160 characters are dropped, and on
        the first matching stream the last "/F2 11 Tf" font selection is
        edited as well. Each cleaned stream is then written back with
        ``replace_object_with``.

        :param content: the pdf document, as read from disk
        :returns: the pdf content after all replacements were applied
        """
        replacements = []

        # jstor has certain watermarks only on the first page; this counts
        # the watermarked streams seen so far
        page_id = 0

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (only the first xref table is used)
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only objects carrying an attribute dictionary are candidates
            if not hasattr(obj, "attrs"):
                continue

            # `in` instead of the deprecated dict.has_key()
            if "Filter" not in obj.attrs or str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            data = copy(obj.get_data())

            # make sure all of the requirements are in there
            if not all(requirement in data for requirement in JSTOR.requirements):
                continue

            better_content = data

            # remove the date
            startpos = better_content.find("This content downloaded ")
            endpos = better_content.find(")", startpos)
            # Without a closing paren find() returns -1, and slicing with it
            # would wipe out nearly everything after the notice. Skip the
            # removal in that case rather than destroying the page.
            if endpos != -1:
                segment = better_content[startpos:endpos]
                better_content = better_content.replace(segment, "")

            # it looks like all of the watermarks are at the end?
            better_content = better_content[:-160]

            # "Accessed on dd/mm/yyyy hh:mm"
            #
            # the "Accessed" line is only on the first page
            #
            # it's based on /F2
            #
            # This would be better if it could be decoded to
            # actually search for the "Accessed" text.
            if page_id == 0 and "/F2 11 Tf\n" in better_content:
                startpos = better_content.rfind("/F2 11 Tf\n")
                endpos = better_content.find("Tf\n", startpos+5)
                better_content = better_content[0:startpos] + better_content[endpos:]

            replacements.append([objid, better_content])

            page_id += 1

        # apply the collected rewrites one object at a time
        for objid, replacement in replacements:
            content = replace_object_with(content, objid, replacement)

        return content

20
tests/test_jstor.py Normal file
View File

@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
import unittest
import pdfparanoia
class JSTORTestCase(unittest.TestCase):
    """Checks the JSTOR plugin against a sample watermarked pdf."""

    def test_jstor(self):
        """Object 19 of the sample must be rewritten by JSTOR.scrub."""
        # the context manager closes the sample even if read() raises
        with open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb") as file_handler:
            content = file_handler.read()

        self.assertIn("\n18 0 obj \n", content)

        # this section will later be manipulated
        self.assertIn("\n19 0 obj \n", content)

        output = pdfparanoia.plugins.JSTOR.scrub(content)

        # FlateDecode should be replaced with a decompressed section
        self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output)