mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
Merge pull request #26 from DonnchaC/rsc
Watermark removal for Royal Society of Chemistry.
This commit is contained in:
commit
404e3577e0
@ -47,10 +47,12 @@ cat input.pdf | pdfparanoia > output.pdf
|
|||||||
* AIP
|
* AIP
|
||||||
* IEEE
|
* IEEE
|
||||||
* JSTOR
|
* JSTOR
|
||||||
|
* RSC
|
||||||
* SPIE (sort of)
|
* SPIE (sort of)
|
||||||
|
|
||||||
## Changelog
|
## Changelog
|
||||||
|
|
||||||
|
* 0.0.13 - RSC
|
||||||
* 0.0.12 - SPIE
|
* 0.0.12 - SPIE
|
||||||
* 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
|
* 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
|
||||||
* 0.0.10 - JSTOR
|
* 0.0.10 - JSTOR
|
||||||
|
@ -17,8 +17,8 @@ usage:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
__title__ = "pdfparanoia"
|
__title__ = "pdfparanoia"
|
||||||
__version__ = "0.0.12"
|
__version__ = "0.0.13"
|
||||||
__build__ = 0x000012
|
__build__ = 0x000013
|
||||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||||
__license__ = "BSD"
|
__license__ = "BSD"
|
||||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||||
|
@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
|
|||||||
from .aip import *
|
from .aip import *
|
||||||
from .ieee import *
|
from .ieee import *
|
||||||
from .jstor import *
|
from .jstor import *
|
||||||
|
from .rsc import *
|
||||||
|
|
||||||
|
74
pdfparanoia/plugins/rsc.py
Normal file
74
pdfparanoia/plugins/rsc.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from copy import copy
|
||||||
|
import sys
|
||||||
|
from ..parser import parse_content
|
||||||
|
from ..plugin import Plugin
|
||||||
|
import base64
|
||||||
|
|
||||||
|
class RoyalSocietyOfChemistry(Plugin):
    """
    RoyalSocietyOfChemistry
    ~~~~~~~~~~~~~~~~~~~~~~~

    RSC watermarks each PDF with a "Downloaded" date and the name
    of the institution from which the PDF was downloaded.

    Watermarks removed:
        * "Downloaded by" watermark and timestamp on each page
        * "Published on" watermark on the side of each page

    This was primarily written for RSC PDFs from http://pubs.rsc.org
    """

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Blank out every FlateDecode stream that contains an RSC watermark.

        :param content: raw contents of the PDF, as read from a file opened
            in binary mode.
        :param verbose: 0 is silent; 1 reports each omitted object on
            stderr; 2 additionally dumps up to 1000 bytes of the decoded
            stream, starting at the matched phrase.
        :return: ``content`` with the raw bytes of every matching stream
            replaced by an empty string.
        """
        # Raw (still encoded) stream bodies that must be cut out of content.
        doomed_streams = []

        # List of watermark strings to remove. Bytes literals are plain str
        # on Python 2 and match the binary stream data on Python 3.
        watermarks = [
            b"Downloaded by ",
            b"Downloaded on ",
            b"Published on ",
            # Currently disabled:
            #"View Article Online",
            #"Journal Homepage",
            #"Table of Contents for this issue",
        ]

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (only the first xref section is used)
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            # ("in" replaces dict.has_key(), which Python 3 removed)
            if not ("Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode"):
                continue

            # Keep the original order: take the raw bytes first, then decode.
            # NOTE(review): presumably get_data() may drop rawdata once the
            # stream is decoded - confirm against the pdfminer version in use.
            rawdata = copy(obj.rawdata)
            data = copy(obj.get_data())

            # Check if any of the watermarks are in the current object
            for phrase in watermarks:
                if phrase not in data:
                    continue

                if verbose >= 2:
                    start = data.index(phrase)
                    sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[start:start + 1000]))
                elif verbose >= 1:
                    sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase))

                # We had a match so replace the watermark data with an empty
                # string. One match is enough: queueing the same stream again
                # for every further phrase would only repeat the same
                # replacement, so only the first matching phrase is reported.
                doomed_streams.append(rawdata)
                break

        for raw_stream in doomed_streams:
            # Directly replace the stream data in binary encoded object
            content = content.replace(raw_stream, b"")

        return content
|
||||||
|
|
||||||
|
|
BIN
tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf
Normal file
BIN
tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf
Normal file
Binary file not shown.
20
tests/test_rsc.py
Normal file
20
tests/test_rsc.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import pdfparanoia
|
||||||
|
|
||||||
|
class RoyalSocietyOfChemistryTestCase(unittest.TestCase):
    """Smoke test for the Royal Society of Chemistry scrubber."""

    def test_rsc(self):
        """Scrub the sample RSC PDF and check the RSC url survives."""
        # The context manager closes the sample even if read() raises.
        with open("tests/samples/rsc/3589bf649f8bb019bd97be9880627b7c.pdf", "rb") as file_handler:
            content = file_handler.read()

        # Check the PDF is from the RSC. The file is read in binary mode, so
        # the needle is a bytes literal (identical to str on Python 2).
        self.assertIn(b"pubs.rsc.org", content)

        output = pdfparanoia.plugins.RoyalSocietyOfChemistry.scrub(content)

        # Check the PDF was output correctly and still
        # contains the RSC url.
        self.assertIn(b"pubs.rsc.org", output)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user