1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-12-04 15:05:52 +01:00

initial commit

This commit is contained in:
Bryan Bishop 2013-02-05 03:10:14 -06:00
commit d8fc6c1d8f
15 changed files with 380 additions and 0 deletions

12
.gitignore vendored Normal file
View File

@ -0,0 +1,12 @@
# ignore editor leftovers
.*.sw*
.*~
*~
# ignore precompiled python files
*.pyc
# ignore setup stuff
build/
dist/
pdfparanoia.egg-info/

17
Makefile Normal file
View File

@ -0,0 +1,17 @@
SHELL := /bin/bash

# These targets are commands, not files: declare them phony so that a file or
# directory with the same name can never make them look up to date.
.PHONY: test clean install upload

# run the test suite
test:
	nosetests-2.7 -s --verbosity=2

# remove build artifacts and precompiled python files
clean:
	rm -fr build/
	rm -fr dist/
	rm -fr *.egg-info
	find . -type f -name "*.pyc" -exec rm '{}' \;

# install the package with the default python interpreter
install:
	python setup.py install

# build a source distribution and upload it
upload:
	python setup.py sdist upload

37
README.md Normal file
View File

@ -0,0 +1,37 @@
# pdfparanoia
pdfparanoia is a PDF watermark remover library for academic papers.
## Installing
Simple.
``` bash
sudo pip install pdfparanoia
```
or,
``` bash
sudo python setup.py install
```
## Usage
``` python
import pdfparanoia
pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
file_handler = open("output.pdf", "wb")
file_handler.write(pdf)
file_handler.close()
```
## Changelog
* 0.0.1 - initial commit
## License
BSD.

27
pdfparanoia/__init__.py Normal file
View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia - pdf watermark remover library for academic papers
~~~~~~~~~~~~~~~
pdfparanoia is a pdf watermark remover library for academic papers. Basic
usage:
>>> import pdfparanoia
>>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
>>> file_handler = open("output.pdf", "wb")
>>> file_handler.write(pdf)
>>> file_handler.close()
:copyright: (c) 2013 by Bryan Bishop.
:license: BSD.
"""
# Package metadata, exposed as module-level attributes.
__title__ = "pdfparanoia"
# Same version string as the one passed to setup() in setup.py.
__version__ = "0.0.1"
# NOTE(review): presumably the version encoded as a hex number -- confirm.
__build__ = 0x000001
__author__ = "Bryan Bishop <kanzure@gmail.com>"
__license__ = "BSD"
__copyright__ = "Copyright 2013 Bryan Bishop"
from . import utils
from .core import scrub

51
pdfparanoia/core.py Normal file
View File

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia.core
~~~~~~~~~~~~~~~
This module provides most of the heavy lifting of pdfparanoia.
"""
import sys
import inspect
from .parser import parse_pdf
from .plugin import Plugin
from .plugins import *
def find_plugins():
    """
    Collects the plugin classes visible in this module.

    A plugin is any class found in this module's namespace that subclasses
    Plugin, with the Plugin base class itself left out. Plugin classes end up
    in the namespace through the star-import of the plugins package.

    Returns a list of classes, in the order inspect.getmembers reports them.
    """
    def is_plugin(candidate):
        # issubclass only accepts classes, so rule out everything else first
        if not inspect.isclass(candidate):
            return False
        return issubclass(candidate, Plugin) and candidate is not Plugin

    this_module = sys.modules[__name__]
    return [member for _, member in inspect.getmembers(this_module, is_plugin)]
def scrub(obj):
    """
    Removes watermarks from a pdf and returns the resulting pdf as a string.

    :param obj: either a file-like object (anything with a ``seek`` method)
        or a path to a pdf on disk.
    :returns: the pdf content after every plugin returned by find_plugins
        has had a chance to scrub it.
    """
    if hasattr(obj, "seek"):
        # The handle belongs to the caller: rewind it and read everything,
        # but leave it open.
        obj.seek(0)
        content = obj.read()
    else:
        # Anything else is treated as a path. This handle is ours, so make
        # sure it gets closed once the raw bytes are loaded.
        with open(obj, "rb") as handler:
            content = handler.read()

    # clean this pdf as much as possible; each plugin receives the output of
    # the previous one
    for plugin in find_plugins():
        content = plugin.scrub(content)

    return content

33
pdfparanoia/eraser.py Normal file
View File

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia.eraser
~~~~~~~~~~~~~~~
Tools to erase things from pdfs by direct manipulation of the pdf format.
"""
def remove_object_by_id(content, objid):
    """
    Deletes an object from a pdf. Mostly streams and FlateDecode stuff.

    The pdf is scanned line by line (split on the newline character). An
    object is recognized by a line that ends with "obj", starts with the
    object id followed by a space, and is either the very first line or
    directly follows an "endobj" line. That line, the object body and the
    closing "endobj" line are all dropped.

    :param content: raw pdf data, either text or bytes.
    :param objid: id of the object to delete.
    :returns: the pdf data without the object, in the same type as content.
    """
    # Build the markers in the same type as the input, so that data read from
    # a file opened in binary mode works as well as plain text.
    if isinstance(content, bytes):
        newline, obj_suffix, end_marker = b"\n", b"obj", b"endobj"
        header_prefix = str(objid).encode("ascii") + b" "
    else:
        newline, obj_suffix, end_marker = "\n", "obj", "endobj"
        header_prefix = str(objid) + " "

    kept = []
    previous = None
    skipping = False

    for line in content.split(newline):
        if skipping:
            # still inside the unwanted object; its "endobj" is dropped too
            if line == end_marker:
                skipping = False
        else:
            # only trust an object header at the start of the data or right
            # after another object has been closed
            starts_target = (
                previous in (end_marker, None)
                and line.endswith(obj_suffix)
                and line.startswith(header_prefix)
            )
            if starts_target:
                skipping = True
            else:
                kept.append(line)
        previous = line

    return newline.join(kept)

45
pdfparanoia/parser.py Normal file
View File

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia.parser
~~~~~~~~~~~~~~~
Deals with the existential nature of parsing pdfs.
"""
from StringIO import StringIO
# Maybe one day pdfquery will be able to save pdf.
# from pdfquery import PDFQuery
from pdfminer.pdfparser import (
PDFParser,
PDFDocument,
)
def parse_pdf(handler):
    """
    Builds a pdfminer document from an open, seekable pdf handle.

    The handle is rewound before parsing, so its current position does not
    matter. Returns the PDFDocument after initialize() has been called on it.
    """
    # always read from the very first byte
    handler.seek(0)

    # the parser and the document each get a reference to the other
    pdf_parser = PDFParser(handler)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    # actual parsing
    document.initialize()

    return document
def parse_content(content):
    """
    Parses a PDF via pdfminer from a string instead of a file handle.

    The string is wrapped in a StringIO object and handed to parse_pdf. There
    are some problems with pdfminer accepting StringIO objects, so this is a
    temporary hack.
    """
    return parse_pdf(StringIO(content))

17
pdfparanoia/plugin.py Normal file
View File

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia.plugin
~~~~~~~~~~~~~~~
Defines how plugins work.
"""
class Plugin(object):
    """
    Base class for watermark scrubbers.

    Subclasses override the scrub static method. The base implementation only
    raises, so a subclass that forgets to override it fails loudly.
    """

    @staticmethod
    def scrub(content):
        """
        Removes watermarks from the given pdf.

        :param content: the raw pdf content to clean.
        :returns: the cleaned pdf content (in subclasses).
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("must be implemented by the subclass")

View File

@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia.plugins
~~~~~~~~~~~~~~~
Scrubbing machines. Bubbles mandatory.
"""
from .aip import *

View File

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
from copy import copy
from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin
class AmericanInstituteOfPhysics(Plugin):
    """
    American Institute of Physics
    ~~~~~~~~~~~~~~~

    These watermarks are pretty basic, but sometimes they don't have indexes
    attached for whatever reason.
    """

    @staticmethod
    def scrub(content):
        """
        Removes every FlateDecode object whose decoded data contains the AIP
        redistribution notice.

        :param content: the raw pdf content.
        :returns: the pdf content with the matching objects removed.
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids; only the first xref section returned
        # by the parser is consulted
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            # objects without attrs cannot carry a Filter entry
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                data = copy(obj.get_data())

                if "Redistribution subject to AIP license or copyright" in data:
                    evil_ids.append(objid)

        # removal happens on the raw content, one object at a time
        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content

9
pdfparanoia/utils.py Normal file
View File

@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
"""
pdfparanoia.utils
~~~~~~~~~~~~~~~
This module provides utility functions used both in pdfparanoia and that are
also useful for external consumption.
"""

29
setup.py Normal file
View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from setuptools import setup
# Read the long description up front so the README handle is closed instead
# of being left open for the rest of the setup run.
with open("README.md", "r") as readme:
    long_description = readme.read()

setup(
    name="pdfparanoia",
    version="0.0.1",
    url="https://github.com/kanzure/pdfparanoia",
    license="BSD",
    author="Bryan Bishop",
    author_email="kanzure@gmail.com",
    description="pdf watermark remover library for academic papers",
    long_description=long_description,
    # The plugins live in a subpackage, which has to be listed explicitly or
    # it is left out of the installation.
    packages=["pdfparanoia", "pdfparanoia.plugins"],
    zip_safe=False,
    include_package_data=True,
    install_requires=["pdfminer>=0", "pdfquery>=0"],
    platforms="any",
    classifiers=[
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        #"Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.1",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
    ]
)

Binary file not shown.

16
tests/test_aip.py Normal file
View File

@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
import unittest
import pdfparanoia
class AmericanInstituteOfPhysicsTestCase(unittest.TestCase):
    """Runs the AIP plugin against a sample pdf from tests/samples."""

    def test_aip(self):
        # read the sample inside a with-block so the handle is closed even if
        # an assertion below fails
        with open("tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf", "rb") as file_handler:
            content = file_handler.read()

        # objects 4 and 10 are present in the sample before scrubbing...
        self.assertIn("\n4 0 obj\n", content)
        self.assertIn("\n10 0 obj\n", content)

        output = pdfparanoia.plugins.AmericanInstituteOfPhysics.scrub(content)

        # ...and must be gone afterwards
        self.assertNotIn("\n4 0 obj\n", output)
        self.assertNotIn("\n10 0 obj\n", output)

27
tests/test_eraser.py Normal file
View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
import unittest
from pdfparanoia.eraser import remove_object_by_id
class EraserTestCase(unittest.TestCase):
    """Exercises remove_object_by_id from pdfparanoia.eraser."""

    def test_remove_object_by_id(self):
        # an empty document stays empty no matter which id is requested
        for objid in (1, 2, 100):
            empty = ""
            self.assertEqual(empty, remove_object_by_id(empty, objid))

        sample = "1 0 obj\nthings\nendobj\nleftovers"

        # asking for an id that is not there leaves the document untouched
        self.assertEqual(sample, remove_object_by_id(sample, 2))

        # removing the only object leaves just the trailing text
        self.assertEqual("leftovers", remove_object_by_id(sample, 1))