initial commit

2025-07-01 19:18:00 +02:00 · 2013-02-05 03:10:14 -06:00 · 2013-02-05 03:10:14 -06:00 · d8fc6c1d8f
commit d8fc6c1d8f
15 changed files with 380 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,12 @@
 # ignore editor leftovers
 .*.sw*
 .*~
 *~
 # ignore precompiled python files
 *.pyc
 # ignore setup stuff
 build/
 dist/
 pdfparanoia.egg-info/
--- a/17
+++ b/17
@ -0,0 +1,17 @@
 # Developer convenience targets for pdfparanoia.
 # Recipes are run with bash rather than the default /bin/sh.
 SHELL := /bin/bash
 # None of these targets produce a file of the same name; declaring them phony
 # keeps make from skipping a target when such a file or directory exists.
 .PHONY: test clean install upload
 # Run the test suite with nose under Python 2.7 (output is not captured).
 test:
 	nosetests-2.7 -s --verbosity=2
 # Remove build artifacts and compiled bytecode.
 clean:
 	rm -fr build/
 	rm -fr dist/
 	rm -fr *.egg-info
 	find . -type f -name "*.pyc" -exec rm '{}' \;
 # Install the package through setup.py.
 install:
 	python setup.py install
 # Build a source distribution and upload it.
 upload:
 	python setup.py sdist upload
--- a/README.md
+++ b/README.md
@ -0,0 +1,37 @@
 # pdfparanoia
 pdfparanoia is a PDF watermark remover library for academic papers.
 ## Installing
 Simple.
 ``` bash
 sudo pip install pdfparanoia
 ```
 or,
 ``` bash
 sudo python setup.py install
 ```
 ## Usage
 ``` python
 import pdfparanoia
 pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
 file_handler = open("output.pdf", "wb")
 file_handler.write(pdf)
 file_handler.close()
 ```
 ## Changelog
 * 0.0.1 - initial commit
 ## License
 BSD.
--- a/pdfparanoia/init.py
+++ b/pdfparanoia/init.py
@ -0,0 +1,27 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia - pdf watermark remover library for academic papers
 ~~~~~~~~~~~~~~~
 pdfparanoia is a pdf watermark remover library for academic papers. Basic
 usage:
    >>> import pdfparanoia
    >>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
    >>> file_handler = open("output.pdf", "wb")
    >>> file_handler.write(pdf)
    >>> file_handler.close()
 :copyright: (c) 2013 by Bryan Bishop.
 :license: BSD.
 """
 __title__ = "pdfparanoia"
 __version__ = "0.0.1"
 __build__ = 0x000001
 __author__ = "Bryan Bishop <kanzure@gmail.com>"
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
 from . import utils
 from .core import scrub
--- a/pdfparanoia/core.py
+++ b/pdfparanoia/core.py
@ -0,0 +1,51 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia.core
 ~~~~~~~~~~~~~~~
 This module provides most of the heavy lifting of pdfparanoia.
 """
 import sys
 import inspect
 from .parser import parse_pdf
 from .plugin import Plugin
 from .plugins import *
 def find_plugins():
    """
    Collects the plugin classes visible in this module's namespace.
    A member qualifies when it is a class, derives from Plugin, and is not
    the Plugin base class itself.
    :returns: list of plugin classes, in the name order that
        inspect.getmembers yields them
    """
    this_module = sys.modules[__name__]
    def is_plugin(candidate):
        # anything that is not a class can never be a plugin
        if not inspect.isclass(candidate):
            return False
        if not issubclass(candidate, Plugin):
            return False
        # the abstract base does no scrubbing of its own
        return candidate is not Plugin
    # getmembers yields (name, value) pairs; only the classes are wanted
    return [member for _, member in inspect.getmembers(this_module, is_plugin)]
 def scrub(obj):
    """
    Removes watermarks from a pdf and returns the resulting pdf as a string.
    :param obj: either an object with seek() and read() (for example an open
        file), or a path that can be handed to open()
    :returns: the pdf content after every plugin from find_plugins() has
        scrubbed it in turn
    """
    if hasattr(obj, "seek"):
        # caller-owned handle: rewind it, read it, and leave closing to the
        # caller
        obj.seek(0)
        content = obj.read()
    else:
        # a path was given: the file is opened here, so it is closed here too
        # instead of leaking the handle
        with open(obj, "rb") as handler:
            content = handler.read()
    # clean this pdf as much as possible; each plugin receives the output of
    # the previous one
    for plugin in find_plugins():
        content = plugin.scrub(content)
    return content
--- a/pdfparanoia/eraser.py
+++ b/pdfparanoia/eraser.py
@ -0,0 +1,33 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia.eraser
 ~~~~~~~~~~~~~~~
 Tools to erase things from pdfs by direct manipulation of the pdf format.
 """
 def remove_object_by_id(content, objid):
    """
    Strips one object from the raw text of a pdf.
    The content is processed line by line. An object header is a line that
    starts with "<objid> " and ends with "obj"; it is only recognised on the
    very first line or directly after an "endobj" line. The header, the body
    and the closing "endobj" line are all dropped.
    :param content: pdf data as a newline-separated string
    :param objid: id of the object to delete
    :returns: the content without the matching object's lines
    """
    header_prefix = str(objid) + " "
    kept = []
    previous = None
    skipping = False
    for current in content.split("\n"):
        if skipping:
            # inside the doomed object: drop lines up to and including the
            # terminating "endobj"
            skipping = current != "endobj"
        elif (previous in ("endobj", None)
              and current.endswith("obj")
              and current.startswith(header_prefix)):
            # header of the object to delete: drop it and start skipping
            skipping = True
        else:
            kept.append(current)
        previous = current
    return "\n".join(kept)
--- a/pdfparanoia/parser.py
+++ b/pdfparanoia/parser.py
@ -0,0 +1,45 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia.parser
 ~~~~~~~~~~~~~~~
 Deals with the existential nature of parsing pdfs.
 """
 from StringIO import StringIO
 # Maybe one day pdfquery will be able to save pdf.
 # from pdfquery import PDFQuery
 from pdfminer.pdfparser import (
    PDFParser,
    PDFDocument,
 )
 def parse_pdf(handler):
    """
    Builds a pdfminer document from a seekable file-like object.
    :param handler: object holding the pdf data; it is rewound with seek(0)
        before being handed to pdfminer
    :returns: the initialized PDFDocument
    """
    # always start from the first byte, wherever the caller left the cursor
    handler.seek(0)
    # link the parser and the document to each other, in both directions
    pdf_parser = PDFParser(handler)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    # this is where the parsing actually happens
    document.initialize()
    return document
 def parse_content(content):
    """
    Parses pdf data held in a string by wrapping it in a StringIO and handing
    it to parse_pdf. There are some problems with pdfminer accepting StringIO
    objects, so this is a temporary hack.
    :param content: raw pdf data
    :returns: whatever parse_pdf returns for the wrapped data
    """
    return parse_pdf(StringIO(content))
--- a/pdfparanoia/plugin.py
+++ b/pdfparanoia/plugin.py
@ -0,0 +1,17 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia.plugin
 ~~~~~~~~~~~~~~~
 Defines how plugins work.
 """
 class Plugin:
    """
    Base class for watermark scrubbers. Subclasses are expected to provide
    their own scrub(); the version here only raises.
    """
    @staticmethod
    def scrub(content):
        """
        Takes the content of a pdf and returns it with watermarks removed.
        :param content: the pdf data to clean
        :raises NotImplementedError: always, in this base implementation
        """
        message = "must be implemented by the subclass"
        raise NotImplementedError(message)
--- a/pdfparanoia/plugins/init.py
+++ b/pdfparanoia/plugins/init.py
@ -0,0 +1,11 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia.plugins
 ~~~~~~~~~~~~~~~
 Scrubbing machines. Bubbles mandatory.
 """
 from .aip import *
--- a/pdfparanoia/plugins/aip.py
+++ b/pdfparanoia/plugins/aip.py
@ -0,0 +1,49 @@
 # -*- coding: utf-8 -*-
 from copy import copy
 from ..parser import parse_content
 from ..eraser import remove_object_by_id
 from ..plugin import Plugin
 class AmericanInstituteOfPhysics(Plugin):
    """
    American Institute of Physics
    ~~~~~~~~~~~~~~~
    These watermarks are pretty basic, but sometimes they don't have indexes
    attached for whatever reason.
    """
    @staticmethod
    def scrub(content):
        """
        Removes AIP watermark objects from the given pdf.
        :param content: raw pdf data
        :returns: the pdf data without the objects whose FlateDecode stream
            contains the AIP redistribution notice
        """
        watermark = "Redistribution subject to AIP license or copyright"
        evil_ids = []
        # parse the pdf into a pdfminer document
        pdf = parse_content(content)
        # get a list of all object ids; note that only the first entry
        # returned by read_xref() is consulted
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        # check each object in the pdf
        for objid in xref.get_objids():
            obj = pdf.getobj(objid)
            # objects without an attrs mapping cannot declare a Filter
            if not hasattr(obj, "attrs"):
                continue
            # "in" replaces dict.has_key(), which no longer exists in Python 3
            if "Filter" not in obj.attrs:
                continue
            # watermarks tend to be in FlateDecode elements
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue
            if watermark in obj.get_data():
                evil_ids.append(objid)
        # delete the offenders from the raw content, one object at a time
        for objid in evil_ids:
            content = remove_object_by_id(content, objid)
        return content
--- a/pdfparanoia/utils.py
+++ b/pdfparanoia/utils.py
@ -0,0 +1,9 @@
 # -*- coding: utf-8 -*-
 """
 pdfparanoia.utils
 ~~~~~~~~~~~~~~~
 This module provides utility functions used both in pdfparanoia and that are
 also useful for external consumption.
 """
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,29 @@
 # -*- coding: utf-8 -*-
 from setuptools import setup
 # Read the long description up front so the README handle is closed right
 # away instead of being left open for the rest of the run.
 with open("README.md", "r") as readme:
    long_description = readme.read()
 setup(
    name="pdfparanoia",
    version="0.0.1",
    url="https://github.com/kanzure/pdfparanoia",
    license="BSD",
    author="Bryan Bishop",
    author_email="kanzure@gmail.com",
    description="pdf watermark remover library for academic papers",
    long_description=long_description,
    # Subpackages are not picked up automatically: without the plugins entry
    # an installed copy fails on "from .plugins import *" in pdfparanoia.core.
    packages=["pdfparanoia", "pdfparanoia.plugins"],
    zip_safe=False,
    include_package_data=True,
    install_requires=["pdfminer>=0", "pdfquery>=0"],
    platforms="any",
    classifiers=[
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        #"Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.1",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
    ]
 )
--- a/tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
+++ b/tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
--- a/tests/test_aip.py
+++ b/tests/test_aip.py
@ -0,0 +1,16 @@
 # -*- coding: utf-8 -*-
 import unittest
 import pdfparanoia
 class AmericanInstituteOfPhysicsTestCase(unittest.TestCase):
    """
    Runs the AIP plugin against a sample paper.
    """
    def test_aip(self):
        """
        Objects 4 and 10 are present in the sample and must be gone after
        scrubbing.
        """
        sample_path = "tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf"
        # the context manager closes the sample file even when the read fails,
        # instead of leaking the handle
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()
        self.assertIn("\n4 0 obj\n", content)
        self.assertIn("\n10 0 obj\n", content)
        output = pdfparanoia.plugins.AmericanInstituteOfPhysics.scrub(content)
        self.assertNotIn("\n4 0 obj\n", output)
        self.assertNotIn("\n10 0 obj\n", output)
--- a/tests/test_eraser.py
+++ b/tests/test_eraser.py
@ -0,0 +1,27 @@
 # -*- coding: utf-8 -*-
 import unittest
 from pdfparanoia.eraser import remove_object_by_id
 class EraserTestCase(unittest.TestCase):
    """
    Checks for pdfparanoia.eraser.
    """
    def test_remove_object_by_id(self):
        """
        Covers empty input, a non-matching id and a matching id.
        """
        # nothing can be removed from an empty document, whatever the id
        for objid in (1, 2, 100):
            self.assertEqual("", remove_object_by_id("", objid))
        document = "1 0 obj\nthings\nendobj\nleftovers"
        # an id that does not occur leaves the document untouched
        self.assertEqual(document, remove_object_by_id(document, 2))
        # a matching id takes the header, body and endobj line with it
        self.assertEqual("leftovers", remove_object_by_id(document, 1))