mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 15:05:52 +01:00
initial commit
This commit is contained in:
commit
d8fc6c1d8f
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@ -0,0 +1,12 @@
|
||||
# ignore editor leftovers
|
||||
.*.sw*
|
||||
.*~
|
||||
*~
|
||||
|
||||
# ignore precompiled python files
|
||||
*.pyc
|
||||
|
||||
# ignore setup stuff
|
||||
build/
|
||||
dist/
|
||||
pdfparanoia.egg-info/
|
17
Makefile
Normal file
17
Makefile
Normal file
@ -0,0 +1,17 @@
|
||||
SHELL := /bin/bash

# These targets only run commands and never create a file with their own
# name; declaring them phony keeps a stray file or directory called "test",
# "clean", "install" or "upload" from making the target look up to date.
.PHONY: test clean install upload

# run the test suite
test:
	nosetests-2.7 -s --verbosity=2

# remove build products and compiled python files
clean:
	rm -fr build/
	rm -fr dist/
	rm -fr *.egg-info
	find . -type f -name "*.pyc" -exec rm '{}' \;

# install the package with setup.py
install:
	python setup.py install

# build a source distribution and upload it
upload:
	python setup.py sdist upload
|
||||
|
37
README.md
Normal file
37
README.md
Normal file
@ -0,0 +1,37 @@
|
||||
# pdfparanoia
|
||||
|
||||
pdfparanoia is a PDF watermark remover library for academic papers.
|
||||
|
||||
## Installing
|
||||
|
||||
Simple.
|
||||
|
||||
``` bash
|
||||
sudo pip install pdfparanoia
|
||||
```
|
||||
|
||||
or,
|
||||
|
||||
``` bash
|
||||
sudo python setup.py install
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
``` python
|
||||
import pdfparanoia
|
||||
|
||||
pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
|
||||
|
||||
file_handler = open("output.pdf", "wb")
|
||||
file_handler.write(pdf)
|
||||
file_handler.close()
|
||||
```
|
||||
|
||||
## Changelog
|
||||
|
||||
* 0.0.1 - initial commit
|
||||
|
||||
## License
|
||||
|
||||
BSD.
|
27
pdfparanoia/__init__.py
Normal file
27
pdfparanoia/__init__.py
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia - pdf watermark remover library for academic papers
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
||||
usage:
|
||||
|
||||
>>> import pdfparanoia
|
||||
>>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
|
||||
>>> file_handler = open("output.pdf", "wb")
|
||||
>>> file_handler.write(pdf)
|
||||
>>> file_handler.close()
|
||||
|
||||
:copyright: (c) 2013 by Bryan Bishop.
|
||||
:license: BSD.
|
||||
"""
|
||||
|
||||
__title__ = "pdfparanoia"
|
||||
__version__ = "0.0.1"
|
||||
__build__ = 0x000001
|
||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||
__license__ = "BSD"
|
||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
||||
from . import utils
|
||||
from .core import scrub
|
51
pdfparanoia/core.py
Normal file
51
pdfparanoia/core.py
Normal file
@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia.core
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
This module provides most of the heavy lifting of pdfparanoia.
|
||||
|
||||
"""
|
||||
|
||||
import sys
|
||||
import inspect
|
||||
|
||||
from .parser import parse_pdf
|
||||
from .plugin import Plugin
|
||||
from .plugins import *
|
||||
|
||||
def find_plugins():
    """
    Collects the plugin classes visible in this module's namespace.

    :returns: a list of every class found on this module that subclasses
        Plugin, excluding the Plugin base class itself, in the order
        inspect.getmembers reports them.
    """
    this_module = sys.modules[__name__]

    def is_concrete_plugin(candidate):
        # the class check comes first: issubclass() rejects non-class arguments
        if not inspect.isclass(candidate):
            return False
        if not issubclass(candidate, Plugin):
            return False
        return candidate is not Plugin

    members = inspect.getmembers(this_module, is_concrete_plugin)
    return [member for _name, member in members]
|
||||
|
||||
def scrub(obj):
    """
    Removes watermarks from a pdf and returns the resulting pdf as a string.

    :param obj: either a file-like object (anything with a ``seek``
        attribute), which is rewound and read but left open for the caller,
        or a path, which is opened in binary mode and closed again here.
    :returns: the pdf content after every plugin returned by find_plugins()
        has had a chance to scrub it.
    """
    if hasattr(obj, "seek"):
        # caller-owned handle: reset it to the start, but do not close it
        obj.seek(0)
        content = obj.read()
    else:
        # this function opened the file, so it is responsible for closing it
        with open(obj, "rb") as handle:
            content = handle.read()

    # clean this pdf as much as possible: each plugin receives the output of
    # the previous one
    for plugin in find_plugins():
        content = plugin.scrub(content)

    return content
|
||||
|
33
pdfparanoia/eraser.py
Normal file
33
pdfparanoia/eraser.py
Normal file
@ -0,0 +1,33 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia.eraser
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Tools to erase things from pdfs by direct manipulation of the pdf format.
|
||||
|
||||
"""
|
||||
|
||||
def remove_object_by_id(content, objid):
    """
    Deletes an object from a pdf. Mostly streams and FlateDecode stuff.

    The pdf is processed line by line (split on newlines). An object is
    dropped when its header line -- one that starts with "<objid> " and ends
    with "obj" -- is either the very first line or directly follows an
    "endobj" line. The header, the body and the closing "endobj" line are all
    discarded. Nothing else is rewritten: every remaining line is emitted
    unchanged.

    :param content: the raw pdf, as either a text string or a byte string.
    :param objid: the numeric id of the object to remove.
    :returns: the pdf without the object, in the same type as ``content``.
    """
    # Pick separators that match the input type, so that byte strings (what a
    # file opened with "rb" yields on Python 3) work as well as text strings.
    if isinstance(content, bytes):
        newline = b"\n"
        obj_suffix = b"obj"
        endobj = b"endobj"
        header_prefix = str(objid).encode("ascii") + b" "
    else:
        newline = "\n"
        obj_suffix = "obj"
        endobj = "endobj"
        header_prefix = str(objid) + " "

    kept = []
    previous = None
    skipping = False

    for line in content.split(newline):
        if skipping:
            # still inside the unwanted object: drop lines until it closes
            if line == endobj:
                skipping = False
        elif (
            (previous is None or previous == endobj)
            and line.endswith(obj_suffix)
            and line.startswith(header_prefix)
        ):
            # header of the unwanted object: drop it and start skipping
            skipping = True
        else:
            kept.append(line)
        previous = line

    return newline.join(kept)
|
||||
|
45
pdfparanoia/parser.py
Normal file
45
pdfparanoia/parser.py
Normal file
@ -0,0 +1,45 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia.parser
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Deals with the existential nature of parsing pdfs.
|
||||
|
||||
"""
|
||||
|
||||
from StringIO import StringIO
|
||||
|
||||
# Maybe one day pdfquery will be able to save pdf.
|
||||
# from pdfquery import PDFQuery
|
||||
|
||||
from pdfminer.pdfparser import (
|
||||
PDFParser,
|
||||
PDFDocument,
|
||||
)
|
||||
|
||||
def parse_pdf(handler):
    """
    Builds a pdfminer document from a seekable file-like object.

    :param handler: file-like object holding the pdf; it is rewound to the
        start before parsing.
    :returns: the PDFDocument, after it has been linked to a PDFParser and
        initialized.
    """
    # always start from the first byte, wherever the caller left the handle
    handler.seek(0)

    pdf_parser = PDFParser(handler)
    document = PDFDocument()

    # the parser and the document are each told about the other
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    # this is the step that performs the actual parsing
    document.initialize()
    return document
|
||||
|
||||
def parse_content(content):
    """
    Parses a PDF held in a string via pdfminer.

    The string is wrapped in a StringIO so that parse_pdf can treat it like a
    file. The original author flagged this as a temporary hack because of
    problems with pdfminer accepting StringIO objects.

    :param content: the raw pdf as a string.
    :returns: whatever parse_pdf returns for the wrapped content.
    """
    return parse_pdf(StringIO(content))
|
||||
|
17
pdfparanoia/plugin.py
Normal file
17
pdfparanoia/plugin.py
Normal file
@ -0,0 +1,17 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia.plugin
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Defines how plugins work.
|
||||
|
||||
"""
|
||||
|
||||
class Plugin(object):
    """
    Base class for watermark scrubbers.

    Subclasses provide their own ``scrub`` static method. Inheriting from
    ``object`` makes this a new-style class on Python 2 as well as Python 3.
    """

    @staticmethod
    def scrub(content):
        """
        Removes watermarks from the given pdf.

        :param content: the raw pdf content to clean.
        :raises NotImplementedError: always; the base class has no scrubbing
            logic of its own.
        """
        raise NotImplementedError("must be implemented by the subclass")
|
||||
|
11
pdfparanoia/plugins/__init__.py
Normal file
11
pdfparanoia/plugins/__init__.py
Normal file
@ -0,0 +1,11 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia.plugins
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
Scrubbing machines. Bubbles mandatory.
|
||||
|
||||
"""
|
||||
|
||||
from .aip import *
|
||||
|
49
pdfparanoia/plugins/aip.py
Normal file
49
pdfparanoia/plugins/aip.py
Normal file
@ -0,0 +1,49 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class AmericanInstituteOfPhysics(Plugin):
    """
    American Institute of Physics
    ~~~~~~~~~~~~~~~

    These watermarks are pretty basic, but sometimes they don't have indexes
    attached for whatever reason.
    """

    @staticmethod
    def scrub(content):
        """
        Removes objects carrying the AIP redistribution notice.

        :param content: the raw pdf content.
        :returns: the content after remove_object_by_id has been applied for
            every object whose decoded FlateDecode data contains the notice.
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids; only the first xref entry returned by
        # the parser is consulted
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            # objects without an attribute dictionary are not inspected
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements; "in" replaces
            # dict.has_key(), which does not exist on Python 3
            attrs = obj.attrs
            if "Filter" not in attrs:
                continue
            if str(attrs["Filter"]) != "/FlateDecode":
                continue

            if "Redistribution subject to AIP license or copyright" in obj.get_data():
                evil_ids.append(objid)

        # removal happens after the scan, one object at a time
        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content
|
||||
|
9
pdfparanoia/utils.py
Normal file
9
pdfparanoia/utils.py
Normal file
@ -0,0 +1,9 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia.utils
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
This module provides utility functions used both in pdfparanoia and that are
|
||||
also useful for external consumption.
|
||||
"""
|
||||
|
29
setup.py
Normal file
29
setup.py
Normal file
@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from setuptools import setup

# Read the long description up front so the README handle is closed right
# away instead of being left open for the rest of the run.
with open("README.md", "r") as readme:
    long_description = readme.read()

setup(
    name="pdfparanoia",
    version="0.0.1",
    url="https://github.com/kanzure/pdfparanoia",
    license="BSD",
    author="Bryan Bishop",
    author_email="kanzure@gmail.com",
    description="pdf watermark remover library for academic papers",
    long_description=long_description,
    # The plugins subpackage must be listed explicitly: pdfparanoia/core.py
    # does "from .plugins import *", so an install without it cannot import.
    packages=["pdfparanoia", "pdfparanoia.plugins"],
    zip_safe=False,
    include_package_data=True,
    install_requires=["pdfminer>=0", "pdfquery>=0"],
    platforms="any",
    classifiers=[
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        #"Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.1",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
    ]
)
|
BIN
tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
Normal file
BIN
tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
Normal file
Binary file not shown.
16
tests/test_aip.py
Normal file
16
tests/test_aip.py
Normal file
@ -0,0 +1,16 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import unittest
|
||||
import pdfparanoia
|
||||
|
||||
class AmericanInstituteOfPhysicsTestCase(unittest.TestCase):
    """
    Runs the AIP plugin against a sample pdf and checks that the objects
    numbered 4 and 10 are gone afterwards.
    """

    def test_aip(self):
        sample_path = "tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf"

        # read inside a context manager so the handle is closed even when an
        # assertion below fails
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        # both objects must be present before scrubbing
        self.assertIn("\n4 0 obj\n", content)
        self.assertIn("\n10 0 obj\n", content)

        output = pdfparanoia.plugins.AmericanInstituteOfPhysics.scrub(content)

        # and absent afterwards
        self.assertNotIn("\n4 0 obj\n", output)
        self.assertNotIn("\n10 0 obj\n", output)
|
||||
|
27
tests/test_eraser.py
Normal file
27
tests/test_eraser.py
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import unittest
|
||||
from pdfparanoia.eraser import remove_object_by_id
|
||||
|
||||
class EraserTestCase(unittest.TestCase):
    """
    Checks remove_object_by_id on empty input and on a one-object sample.
    """

    def test_remove_object_by_id(self):
        # empty input stays empty whichever id is requested
        for objid in (1, 2, 100):
            self.assertEqual("", remove_object_by_id("", objid))

        sample = "1 0 obj\nthings\nendobj\nleftovers"

        # asking for an id that is not present leaves the content untouched
        self.assertEqual(sample, remove_object_by_id(sample, 2))

        # removing the only object leaves just the trailing line
        self.assertEqual("leftovers", remove_object_by_id(sample, 1))
|
||||
|
Loading…
Reference in New Issue
Block a user