initial commit

2025-07-04 20:37:38 +02:00 · 2013-02-05 03:10:14 -06:00 · 2013-02-05 03:10:14 -06:00 · d8fc6c1d8f
commit d8fc6c1d8f
15 changed files with 380 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,12 @@
+# ignore editor leftovers
+.*.sw*
+.*~
+*~
+
+# ignore precompiled python files
+*.pyc
+
+# ignore setup stuff
+build/
+dist/
+pdfparanoia.egg-info/
--- a/17
+++ b/17
@ -0,0 +1,17 @@
+SHELL := /bin/bash
+
+# None of these targets produce a file of the same name, so declare them
+# phony; otherwise a stray file called "test", "clean", "install" or "upload"
+# would make the target look up to date and its recipe would be skipped.
+.PHONY: test clean install upload
+
+# run the test suite with nose under python 2.7
+test:
+	nosetests-2.7 -s --verbosity=2
+
+# remove build artifacts and precompiled python files
+clean:
+	rm -fr build/
+	rm -fr dist/
+	rm -fr *.egg-info
+	find . -type f -name "*.pyc" -exec rm '{}' \;
+
+# install the package through setup.py
+install:
+	python setup.py install
+
+# build a source distribution and upload it
+upload:
+	python setup.py sdist upload
+
--- a/README.md
+++ b/README.md
@ -0,0 +1,37 @@
+# pdfparanoia
+
+pdfparanoia is a PDF watermark remover library for academic papers.
+
+## Installing
+
+Simple.
+
+``` bash
+sudo pip install pdfparanoia
+```
+
+or,
+
+``` bash
+sudo python setup.py install
+```
+
+## Usage
+
+``` python
+import pdfparanoia
+
+pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
+
+file_handler = open("output.pdf", "wb")
+file_handler.write(pdf)
+file_handler.close()
+```
+
+## Changelog
+
+* 0.0.1 - initial commit
+
+## License
+
+BSD.
--- a/pdfparanoia/init.py
+++ b/pdfparanoia/init.py
@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia - pdf watermark remover library for academic papers
+~~~~~~~~~~~~~~~
+
+pdfparanoia is a pdf watermark remover library for academic papers. Basic
+usage:
+
+    >>> import pdfparanoia
+    >>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
+    >>> file_handler = open("output.pdf", "wb")
+    >>> file_handler.write(pdf)
+    >>> file_handler.close()
+
+:copyright: (c) 2013 by Bryan Bishop.
+:license: BSD.
+"""
+
+# package metadata, exposed as module-level attributes
+__title__ = "pdfparanoia"
+# same version string that setup.py declares
+__version__ = "0.0.1"
+__build__ = 0x000001
+__author__ = "Bryan Bishop <kanzure@gmail.com>"
+__license__ = "BSD"
+__copyright__ = "Copyright 2013 Bryan Bishop"
+
+from . import utils
+from .core import scrub
--- a/pdfparanoia/core.py
+++ b/pdfparanoia/core.py
@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia.core
+~~~~~~~~~~~~~~~
+
+This module provides most of the heavy lifting of pdfparanoia.
+
+"""
+
+import sys
+import inspect
+
+from .parser import parse_pdf
+from .plugin import Plugin
+from .plugins import *
+
+def find_plugins():
+    """
+    Collects every Plugin subclass visible in this module's namespace.
+
+    The Plugin base class itself is left out of the result.
+
+    :returns: list of plugin classes, in the order inspect.getmembers
+        reports them
+    """
+    def is_plugin(candidate):
+        # anything that is not a class can never be a plugin
+        if not inspect.isclass(candidate):
+            return False
+        # the abstract base does no scrubbing of its own
+        if candidate is Plugin:
+            return False
+        return issubclass(candidate, Plugin)
+
+    this_module = sys.modules[__name__]
+    return [cls for (_, cls) in inspect.getmembers(this_module, is_plugin)]
+
+def scrub(obj):
+    """
+    Removes watermarks from a pdf and returns the resulting pdf as a string.
+
+    :param obj: either a file-like object (anything with a ``seek`` method)
+        or a path to a pdf file on disk
+    :returns: the raw pdf content after every plugin has processed it
+    """
+    if hasattr(obj, "seek"):
+        # the caller owns this handle: rewind it, read it, leave it open
+        obj.seek(0)
+        content = obj.read()
+    else:
+        # we were handed a path, so we own the handle and must close it
+        with open(obj, "rb") as handler:
+            content = handler.read()
+
+    # clean this pdf as much as possible, one plugin after another
+    for plugin in find_plugins():
+        content = plugin.scrub(content)
+
+    return content
+
--- a/pdfparanoia/eraser.py
+++ b/pdfparanoia/eraser.py
@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia.eraser
+~~~~~~~~~~~~~~~
+
+Tools to erase things from pdfs by direct manipulation of the pdf format.
+
+"""
+
+def remove_object_by_id(content, objid):
+    """
+    Deletes an object from a pdf. Mostly streams and FlateDecode stuff.
+
+    The content is processed line by line. An object header is recognised
+    only when it is the very first line or directly follows an "endobj"
+    line, ends in "obj" and starts with the object id followed by a space.
+    The header, the body and the closing "endobj" line are all dropped.
+
+    :param content: pdf content as a string
+    :param objid: id of the object to delete
+    :returns: the content without the lines of the matching object
+    """
+    header_prefix = str(objid) + " "
+    kept = []
+    previous = None
+    skipping = False
+
+    for current in content.split("\n"):
+        if skipping:
+            # the closing endobj is dropped too, and ends the skipped region
+            skipping = current != "endobj"
+        elif (previous in ("endobj", None)
+                and current.endswith("obj")
+                and current.startswith(header_prefix)):
+            skipping = True
+        else:
+            kept.append(current)
+        previous = current
+
+    return "\n".join(kept)
+
--- a/pdfparanoia/parser.py
+++ b/pdfparanoia/parser.py
@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia.parser
+~~~~~~~~~~~~~~~
+
+Deals with the existential nature of parsing pdfs.
+
+"""
+
+from StringIO import StringIO
+
+# Maybe one day pdfquery will be able to save pdf.
+# from pdfquery import PDFQuery
+
+from pdfminer.pdfparser import (
+    PDFParser,
+    PDFDocument,
+)
+
+def parse_pdf(handler):
+    """
+    Builds an initialized pdfminer document from a seekable file handler.
+
+    :param handler: file-like object; it is rewound before parsing
+    :returns: the pdfminer PDFDocument, after initialize() has been called
+    """
+    # always start from the first byte, wherever the caller left the handler
+    handler.seek(0)
+
+    pdf_parser = PDFParser(handler)
+    document = PDFDocument()
+
+    # wire the parser and the document to each other before initializing
+    pdf_parser.set_document(document)
+    document.set_parser(pdf_parser)
+
+    document.initialize()
+    return document
+
+def parse_content(content):
+    """
+    Parses a PDF held in memory as a string, via pdfminer.
+
+    The string is wrapped in a StringIO so that parse_pdf can treat it like
+    a file. There are some problems with pdfminer accepting StringIO
+    objects, so this is a temporary hack.
+    """
+    return parse_pdf(StringIO(content))
+
--- a/pdfparanoia/plugin.py
+++ b/pdfparanoia/plugin.py
@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia.plugin
+~~~~~~~~~~~~~~~
+
+Defines how plugins work.
+
+"""
+
+class Plugin:
+    """
+    Base class for scrubbers. Subclasses provide their own scrub().
+    """
+
+    @staticmethod
+    def scrub(content):
+        """
+        Takes pdf content and is meant to return it with watermarks removed.
+
+        The base class has no implementation and always raises
+        NotImplementedError.
+        """
+        message = "must be implemented by the subclass"
+        raise NotImplementedError(message)
+
--- a/pdfparanoia/plugins/init.py
+++ b/pdfparanoia/plugins/init.py
@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia.plugins
+~~~~~~~~~~~~~~~
+
+Scrubbing machines. Bubbles mandatory.
+
+"""
+
+from .aip import *
+
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+
+from ..parser import parse_content
+from ..eraser import remove_object_by_id
+from ..plugin import Plugin
+
+class AmericanInstituteOfPhysics(Plugin):
+    """
+    American Institute of Physics
+    ~~~~~~~~~~~~~~~
+
+    These watermarks are pretty basic, but sometimes they don't have indexes
+    attached for whatever reason.
+    """
+
+    @staticmethod
+    def scrub(content):
+        """
+        Erases every FlateDecode object whose decoded data contains the AIP
+        redistribution notice.
+
+        :param content: raw pdf content as a string
+        :returns: the pdf content with the matching objects removed
+        """
+        evil_ids = []
+
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids (only the first xref entry is used)
+        xrefs = pdf._parser.read_xref()
+        xref = xrefs[0]
+        objids = xref.get_objids()
+
+        # check each object in the pdf
+        for objid in objids:
+            obj = pdf.getobj(objid)
+
+            # objects without attrs cannot be checked for a filter
+            if not hasattr(obj, "attrs"):
+                continue
+
+            # watermarks tend to be in FlateDecode elements; "in" is used
+            # rather than has_key(), which python 3 dicts no longer have
+            if "Filter" not in obj.attrs:
+                continue
+            if str(obj.attrs["Filter"]) != "/FlateDecode":
+                continue
+
+            # the data is only searched, never modified, so no copy is made
+            data = obj.get_data()
+            if "Redistribution subject to AIP license or copyright" in data:
+                evil_ids.append(objid)
+
+        for objid in evil_ids:
+            content = remove_object_by_id(content, objid)
+
+        return content
+
--- a/pdfparanoia/utils.py
+++ b/pdfparanoia/utils.py
@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+pdfparanoia.utils
+~~~~~~~~~~~~~~~
+
+This module provides utility functions that are used within pdfparanoia and
+are also useful for external consumption.
+"""
+
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+from setuptools import setup
+
+# read the long description up front so the file handle gets closed
+with open("README.md", "r") as readme:
+    long_description = readme.read()
+
+setup(
+    name="pdfparanoia",
+    version="0.0.1",
+    url="https://github.com/kanzure/pdfparanoia",
+    license="BSD",
+    author="Bryan Bishop",
+    author_email="kanzure@gmail.com",
+    description="pdf watermark remover library for academic papers",
+    long_description=long_description,
+    # subpackages are not picked up automatically; without the plugins
+    # package an installed pdfparanoia fails on "from .plugins import *"
+    packages=["pdfparanoia", "pdfparanoia.plugins"],
+    zip_safe=False,
+    include_package_data=True,
+    install_requires=["pdfminer>=0", "pdfquery>=0"],
+    platforms="any",
+    classifiers=[
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        #"Programming Language :: Python :: 2.6",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.1",
+        "Programming Language :: Python :: 3.2",
+        "Programming Language :: Python :: 3.3",
+    ]
+)
--- a/tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
+++ b/tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
--- a/tests/test_aip.py
+++ b/tests/test_aip.py
@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class AmericanInstituteOfPhysicsTestCase(unittest.TestCase):
+    def test_aip(self):
+        """
+        Objects 4 and 10 of the sample paper are expected to be present
+        before scrubbing and gone afterwards.
+        """
+        path = "tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf"
+
+        # close the sample file as soon as it has been read
+        with open(path, "rb") as file_handler:
+            content = file_handler.read()
+
+        self.assertIn("\n4 0 obj\n", content)
+        self.assertIn("\n10 0 obj\n", content)
+
+        output = pdfparanoia.plugins.AmericanInstituteOfPhysics.scrub(content)
+        self.assertNotIn("\n4 0 obj\n", output)
+        self.assertNotIn("\n10 0 obj\n", output)
+
--- a/tests/test_eraser.py
+++ b/tests/test_eraser.py
@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+from pdfparanoia.eraser import remove_object_by_id
+
+class EraserTestCase(unittest.TestCase):
+    def test_remove_object_by_id(self):
+        """
+        Empty input and non-matching ids leave the content untouched; a
+        matching id removes the object through its closing endobj line.
+        """
+        document = "1 0 obj\nthings\nendobj\nleftovers"
+
+        # (content, objid, expected output)
+        cases = [
+            ("", 1, ""),
+            ("", 2, ""),
+            ("", 100, ""),
+            (document, 2, document),
+            (document, 1, "leftovers"),
+        ]
+
+        for content, objid, expected in cases:
+            self.assertEqual(expected, remove_object_by_id(content, objid))
+