commit d8fc6c1d8fa2594e6c976ea13463eb80cfaed673 Author: Bryan Bishop Date: Tue Feb 5 03:10:14 2013 -0600 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..23f078d --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# ignore editor leftovers +.*.sw* +.*~ +*~ + +# ignore precompiled python files +*.pyc + +# ignore setup stuff +build/ +dist/ +pdfparanoia.egg-info/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e3de402 --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +SHELL := /bin/bash + +test: + nosetests-2.7 -s --verbosity=2 + +clean: + rm -fr build/ + rm -fr dist/ + rm -fr *.egg-info + find . -type f -name "*.pyc" -exec rm '{}' \; + +install: + python setup.py install + +upload: + python setup.py sdist upload + diff --git a/README.md b/README.md new file mode 100644 index 0000000..b36aec4 --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +# pdfparanoia + +pdfparanoia is a PDF watermark remover library for academic papers. + +## Installing + +Simple. + +``` bash +sudo pip install pdfparanoia +``` + +or, + +``` bash +sudo python setup.py install +``` + +## Usage + +``` python +import pdfparanoia + +pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb")) + +file_handler = open("output.pdf", "wb") +file_handler.write(pdf) +file_handler.close() +``` + +## Changelog + +* 0.0.1 - initial commit + +## License + +BSD. diff --git a/pdfparanoia/__init__.py b/pdfparanoia/__init__.py new file mode 100644 index 0000000..1e71732 --- /dev/null +++ b/pdfparanoia/__init__.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +""" +pdfparanoia - pdf watermark remover library for academic papers +~~~~~~~~~~~~~~~ + +pdfparanoia is a pdf watermark remover library for academic papers. Basic +usage: + + >>> import pdfparanoia + >>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r")) + >>> file_handler = open("output.pdf", "w") + >>> file_handler.write(pdf) + >>> file_handler.close() + +:copyright: (c) 2013 by Bryan Bishop. +:license: BSD. 
def remove_object_by_id(content, objid):
    """
    Deletes an object from a pdf. Mostly streams and FlateDecode stuff.

    The pdf is treated as newline-separated text. An object header is a line
    that starts with ``"<objid> "``, ends with ``"obj"`` and is either the very
    first line or directly follows an ``endobj`` line. The header, everything
    after it, and the closing ``endobj`` line are dropped. The generation
    number in the header is not checked.

    :param content: raw pdf data, either ``str`` or ``bytes``.
    :param objid: numeric id of the object to delete.
    :returns: the pdf data without the object, in the same type as
        ``content``. The input is returned unchanged (line for line) when no
        matching object header is found.

    Note that if the matched object is never closed by an ``endobj`` line,
    everything after its header is dropped.
    """
    # Pick markers of the same type as the input so that raw bytes read from
    # a file opened with "rb" work on Python 3 as well as on Python 2.
    if isinstance(content, bytes):
        newline = b"\n"
        end_marker = b"endobj"
        obj_suffix = b"obj"
        header_prefix = str(objid).encode("ascii") + b" "
    else:
        newline = "\n"
        end_marker = "endobj"
        obj_suffix = "obj"
        header_prefix = str(objid) + " "

    kept = []
    previous = None
    skipping = False
    for line in content.split(newline):
        if skipping:
            # the closing "endobj" is dropped together with the object body
            if line == end_marker:
                skipping = False
        else:
            # only look for a header at the start of the data or right after
            # another object has been closed
            at_boundary = previous is None or previous == end_marker
            is_header = (
                at_boundary
                and line.endswith(obj_suffix)
                and line.startswith(header_prefix)
            )
            if is_header:
                skipping = True
            else:
                kept.append(line)
        previous = line

    return newline.join(kept)
class AmericanInstituteOfPhysics(Plugin):
    """
    American Institute of Physics
    ~~~~~~~~~~~~~~~

    These watermarks are pretty basic, but sometimes they don't have indexes
    attached for whatever reason.
    """

    @staticmethod
    def scrub(content):
        """
        Removes AIP watermark objects from the given pdf.

        The pdf is parsed with ``parse_content`` and each object id listed by
        the first cross-reference section is inspected. Objects whose
        ``Filter`` attribute is ``/FlateDecode`` and whose decoded data
        contains the AIP redistribution notice are deleted from the raw
        content with ``remove_object_by_id``.

        :param content: raw pdf data.
        :returns: the pdf data with the watermark objects removed.
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids; only the first xref section is used
        xrefs = pdf._parser.read_xref()
        objids = xrefs[0].get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            # objects without attributes cannot carry a Filter entry
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements; "in" is used
            # because dict.has_key() does not exist on Python 3
            # NOTE(review): assumes obj.attrs is a dict-like mapping - confirm
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                data = obj.get_data()

                if "Redistribution subject to AIP license or copyright" in data:
                    evil_ids.append(objid)

        # delete the offending objects from the raw pdf, one at a time
        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content
# -*- coding: utf-8 -*-
from setuptools import setup

# Read the long description up front so the file handle is closed again
# instead of being left open for the rest of the setup run.
with open("README.md", "r") as readme:
    long_description = readme.read()

setup(
    name="pdfparanoia",
    version="0.0.1",
    url="https://github.com/kanzure/pdfparanoia",
    license="BSD",
    author="Bryan Bishop",
    author_email="kanzure@gmail.com",
    description="pdf watermark remover library for academic papers",
    long_description=long_description,
    # The plugins subpackage has to be listed explicitly; without it an
    # installed copy fails on "from .plugins import *" in pdfparanoia.core.
    packages=["pdfparanoia", "pdfparanoia.plugins"],
    zip_safe=False,
    include_package_data=True,
    install_requires=["pdfminer>=0", "pdfquery>=0"],
    platforms="any",
    classifiers=[
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        #"Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.1",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
    ]
)
class EraserTestCase(unittest.TestCase):
    """Checks ``remove_object_by_id`` on empty and single-object inputs."""

    def test_remove_object_by_id(self):
        """
        Empty content and a non-matching id must come back unchanged; a
        matching id must remove the whole object and keep what follows it.
        """
        single_object = "1 0 obj\nthings\nendobj\nleftovers"

        # (content, object id, expected output), checked in this order
        cases = (
            ("", 1, ""),
            ("", 2, ""),
            ("", 100, ""),
            (single_object, 2, single_object),
            (single_object, 1, "leftovers"),
        )

        for content, objid, expected in cases:
            output = remove_object_by_id(content, objid)
            self.assertEqual(expected, output)