Adding support for PDFs from pubs.rsc.org

This commit is contained in:
Donncha O'Cearbhaill 2013-05-13 20:28:35 +01:00
parent 9d26a0aa01
commit 18140d838d
6 changed files with 99 additions and 2 deletions

View File

@ -47,10 +47,12 @@ cat input.pdf | pdfparanoia > output.pdf
* AIP
* IEEE
* JSTOR
* RSC
* SPIE (sort of)
## Changelog
* 0.0.13 - RSC
* 0.0.12 - SPIE
* 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
* 0.0.10 - JSTOR

View File

@ -17,8 +17,8 @@ usage:
"""
# Package metadata for pdfparanoia.
# NOTE(review): __version__ and __build__ each appear twice below; this looks
# like a diff rendering with the +/- markers stripped (0.0.12 being the
# removed values, 0.0.13 the added ones) -- confirm against the real file.
# If executed as written, the later assignments (0.0.13 / 0x000013) win.
__title__ = "pdfparanoia"
__version__ = "0.0.12"
__build__ = 0x000012
__version__ = "0.0.13"
__build__ = 0x000013
__author__ = "Bryan Bishop <kanzure@gmail.com>"
__license__ = "BSD"
__copyright__ = "Copyright 2013 Bryan Bishop"

View File

@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
from .aip import *
from .ieee import *
from .jstor import *
from .rsc import *

View File

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
from copy import copy
import sys
from ..parser import parse_content
from ..plugin import Plugin
import base64
class RoyalSocietyOfChemistry(Plugin):
    """
    RoyalSocietyOfChemistry
    ~~~~~~~~~~~~~~~~~~~~~~~

    RSC watermarks each PDF with a "Downloaded" date and the name
    of the institution from which the PDF was downloaded.

    Watermarks removed:
        * "Downloaded by" watermark and timestamp on each page
        * "Published on" watermark on the side of each page

    This was primarily written for RSC PDFs from http://pubs.rsc.org
    """

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Strip RSC watermark streams out of a PDF.

        Every FlateDecode stream object whose decoded data contains one of
        the watermark phrases has its raw (still encoded) stream data
        replaced with an empty string in ``content``.

        :param content: the raw data of the PDF file
        :param verbose: 0 is silent; >= 1 reports each matching object and
            phrase on stderr; >= 2 additionally dumps up to 1000 characters
            of the decoded stream starting at the matched phrase
        :return: ``content`` with the raw data of all matching streams
            removed
        """
        replacements = []

        # List of watermark strings to remove
        watermarks = [
            "Downloaded by ",
            "Downloaded on ",
            "Published on ",
            #"View Article Online",
            #"Journal Homepage",
            #"Table of Contents for this issue",
        ]

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        # NOTE(review): only the first xref section returned by the parser
        # is inspected; objects listed only in later sections are skipped.
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            # Capture the encoded stream before asking for the decoded data.
            # NOTE(review): presumably get_data() decodes the stream in place
            # and may discard rawdata afterwards -- keep this order.
            rawdata = copy(obj.rawdata)
            data = copy(obj.get_data())

            # Check if any of the watermarks are in the current object
            matched = False
            for phrase in watermarks:
                if phrase not in data:
                    continue
                matched = True

                if verbose >= 2:
                    start = data.index(phrase)
                    sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[start:start + 1000]))
                elif verbose >= 1:
                    sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))

            # We had a match so replace the watermark data with an empty
            # string. Queue it once per object, however many phrases matched,
            # so the full-content replace below is not repeated needlessly.
            if matched:
                replacements.append([rawdata, ""])

        for original, replacement in replacements:
            # Directly replace the stream data in binary encoded object
            content = content.replace(original, replacement)

        return content

Binary file not shown.

20
tests/test_rsc.py Normal file
View File

@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
import unittest
import pdfparanoia
class RoyalSocietyOfChemistryTestCase(unittest.TestCase):
    """Smoke test for the RoyalSocietyOfChemistry scrubbing plugin."""

    def test_rsc(self):
        """Scrub the sample RSC PDF and check the RSC url is still present."""
        # Use a context manager so the handle is closed even if read() raises.
        with open("tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf", "rb") as file_handler:
            content = file_handler.read()

        # Check the PDF is from the RSC
        self.assertIn("pubs.rsc.org", content)

        output = pdfparanoia.plugins.RoyalSocietyOfChemistry.scrub(content)

        # Check the PDF was output correctly and still
        # contains the RSC url.
        self.assertIn("pubs.rsc.org", output)