mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
initial commit
This commit is contained in:
commit
d8fc6c1d8f
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# ignore editor leftovers
|
||||||
|
.*.sw*
|
||||||
|
.*~
|
||||||
|
*~
|
||||||
|
|
||||||
|
# ignore precompiled python files
|
||||||
|
*.pyc
|
||||||
|
|
||||||
|
# ignore setup stuff
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
pdfparanoia.egg-info/
|
17
Makefile
Normal file
17
Makefile
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
SHELL := /bin/bash

# None of these targets produce a file named after themselves; declaring them
# phony keeps them runnable even if such a file or directory ever appears.
.PHONY: test clean install upload

# run the test suite
test:
	nosetests-2.7 -s --verbosity=2

# remove build artifacts and precompiled python files
clean:
	rm -fr build/
	rm -fr dist/
	rm -fr *.egg-info
	find . -type f -name "*.pyc" -exec rm '{}' \;

# install the package
install:
	python setup.py install

# build a source distribution and upload it
upload:
	python setup.py sdist upload
|
||||||
|
|
37
README.md
Normal file
37
README.md
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
# pdfparanoia
|
||||||
|
|
||||||
|
pdfparanoia is a PDF watermark remover library for academic papers.
|
||||||
|
|
||||||
|
## Installing
|
||||||
|
|
||||||
|
Simple.
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
sudo pip install pdfparanoia
|
||||||
|
```
|
||||||
|
|
||||||
|
or,
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
sudo python setup.py install
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
``` python
|
||||||
|
import pdfparanoia
|
||||||
|
|
||||||
|
pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
|
||||||
|
|
||||||
|
file_handler = open("output.pdf", "wb")
|
||||||
|
file_handler.write(pdf)
|
||||||
|
file_handler.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Changelog
|
||||||
|
|
||||||
|
* 0.0.1 - initial commit
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
BSD.
|
27
pdfparanoia/__init__.py
Normal file
27
pdfparanoia/__init__.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia - pdf watermark remover library for academic papers
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
||||||
|
usage:
|
||||||
|
|
||||||
|
>>> import pdfparanoia
|
||||||
|
>>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
|
||||||
|
>>> file_handler = open("output.pdf", "wb")
|
||||||
|
>>> file_handler.write(pdf)
|
||||||
|
>>> file_handler.close()
|
||||||
|
|
||||||
|
:copyright: (c) 2013 by Bryan Bishop.
|
||||||
|
:license: BSD.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__title__ = "pdfparanoia"
|
||||||
|
__version__ = "0.0.1"
|
||||||
|
__build__ = 0x000001
|
||||||
|
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||||
|
__license__ = "BSD"
|
||||||
|
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||||
|
|
||||||
|
from . import utils
|
||||||
|
from .core import scrub
|
51
pdfparanoia/core.py
Normal file
51
pdfparanoia/core.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia.core
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module provides most of the heavy lifting of pdfparanoia.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
from .parser import parse_pdf
|
||||||
|
from .plugin import Plugin
|
||||||
|
from .plugins import *
|
||||||
|
|
||||||
|
def find_plugins():
    """
    Collects the plugin classes that are visible in this module.

    A member qualifies when it is a class, subclasses Plugin, and is not
    Plugin itself. The classes are returned as a list, in the order that
    inspect.getmembers reports them.
    """
    def is_concrete_plugin(candidate):
        # issubclass() is only meaningful for classes, so rule out the rest
        if not inspect.isclass(candidate):
            return False
        return issubclass(candidate, Plugin) and candidate is not Plugin

    this_module = sys.modules[__name__]
    members = inspect.getmembers(this_module, is_concrete_plugin)
    return [member for _, member in members]
|
||||||
|
|
||||||
|
def scrub(obj):
    """
    Removes watermarks from a pdf and returns the resulting pdf as a string.

    :param obj: either an open file-like object (anything with a ``seek``
        method) or a path to a pdf file
    :returns: the raw pdf content after every plugin has scrubbed it
    """
    if hasattr(obj, "seek"):
        # caller-owned handle: rewind it, but leave closing to the caller
        obj.seek(0)
        content = obj.read()
    else:
        # we opened this file ourselves, so make sure it gets closed again
        with open(obj, "rb") as handle:
            content = handle.read()

    # clean this pdf as much as possible, one plugin after another
    for plugin in find_plugins():
        content = plugin.scrub(content)

    return content
|
||||||
|
|
33
pdfparanoia/eraser.py
Normal file
33
pdfparanoia/eraser.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia.eraser
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Tools to erase things from pdfs by direct manipulation of the pdf format.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def remove_object_by_id(content, objid):
    """
    Deletes an object from a pdf. Mostly streams and FlateDecode stuff.

    The content is scanned line by line. An object header is a line that
    starts with ``"<objid> "``, ends with ``obj`` and is either the very first
    line or directly follows an ``endobj`` line. The header, the object body
    and the closing ``endobj`` line are dropped; every other line is kept
    untouched.

    :param content: the raw pdf, either as text or as bytes
    :param objid: the id of the object to remove
    :returns: the content without that object, of the same type as ``content``
    """
    is_bytes = isinstance(content, bytes)

    def literal(text):
        # keep every constant in the same type as the content, so that raw
        # bytes read from a file opened with "rb" work as well as text
        return text.encode("ascii") if is_bytes else text

    newline = literal("\n")
    carriage_return = literal("\r")
    obj_keyword = literal("obj")
    endobj_keyword = literal("endobj")
    header_prefix = literal(str(objid) + " ")

    kept = []
    previous = None
    skipping = False
    for line in content.split(newline):
        # compare without a trailing carriage return so that "\r\n" line
        # endings are recognized; kept lines are still emitted unmodified
        token = line.rstrip(carriage_return)
        if skipping:
            if token == endobj_keyword:
                skipping = False
        elif (previous in (endobj_keyword, None)
                and token.endswith(obj_keyword)
                and token.startswith(header_prefix)):
            skipping = True
        else:
            kept.append(line)
        previous = token

    return newline.join(kept)
|
||||||
|
|
45
pdfparanoia/parser.py
Normal file
45
pdfparanoia/parser.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia.parser
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Deals with the existential nature of parsing pdfs.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from StringIO import StringIO
|
||||||
|
|
||||||
|
# Maybe one day pdfquery will be able to save pdf.
|
||||||
|
# from pdfquery import PDFQuery
|
||||||
|
|
||||||
|
from pdfminer.pdfparser import (
|
||||||
|
PDFParser,
|
||||||
|
PDFDocument,
|
||||||
|
)
|
||||||
|
|
||||||
|
def parse_pdf(handler):
    """
    Builds a pdfminer document from a seekable file-like object.

    The handler is rewound first, then a PDFParser and a PDFDocument are
    wired to each other and the document is initialized before being
    returned.
    """
    # always start reading from the beginning of the data
    handler.seek(0)

    # the parser and the document need to know about each other
    pdf_parser = PDFParser(handler)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    # this is where the pdf actually gets parsed
    document.initialize()
    return document
|
||||||
|
|
||||||
|
def parse_content(content):
    """
    Parses a PDF via pdfminer from a string instead of a file.

    The string is wrapped in a StringIO object and handed to parse_pdf. There
    are some problems with pdfminer accepting StringIO objects, so this is a
    temporary hack.
    """
    return parse_pdf(StringIO(content))
|
||||||
|
|
17
pdfparanoia/plugin.py
Normal file
17
pdfparanoia/plugin.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia.plugin
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Defines how plugins work.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
class Plugin(object):
    """
    Base class for watermark scrubbers.

    Subclasses override the static ``scrub`` method. Inheriting from
    ``object`` makes this a new-style class on Python 2 as well.
    """

    @staticmethod
    def scrub(content):
        """
        Removes watermarks from the given pdf.

        :param content: the raw pdf content
        :returns: the scrubbed pdf content (in subclasses)
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError("must be implemented by the subclass")
|
||||||
|
|
11
pdfparanoia/plugins/__init__.py
Normal file
11
pdfparanoia/plugins/__init__.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia.plugins
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Scrubbing machines. Bubbles mandatory.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .aip import *
|
||||||
|
|
49
pdfparanoia/plugins/aip.py
Normal file
49
pdfparanoia/plugins/aip.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from copy import copy
|
||||||
|
|
||||||
|
from ..parser import parse_content
|
||||||
|
from ..eraser import remove_object_by_id
|
||||||
|
from ..plugin import Plugin
|
||||||
|
|
||||||
|
class AmericanInstituteOfPhysics(Plugin):
    """
    American Institute of Physics
    ~~~~~~~~~~~~~~~

    These watermarks are pretty basic, but sometimes they don't have indexes
    attached for whatever reason.
    """

    @staticmethod
    def scrub(content):
        """
        Removes the AIP redistribution notice from a pdf.

        Objects with a /FlateDecode filter whose data contains the AIP
        notice are collected and then removed from the raw content.

        :param content: the raw pdf content
        :returns: the content without the watermark objects
        """
        # A bytes literal is the same thing as a plain string on Python 2 and
        # still matches the stream data when that data is bytes.
        watermark = b"Redistribution subject to AIP license or copyright"
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        # NOTE(review): only the first entry returned by read_xref() is used;
        # confirm that pdfs with several xref sections are covered.
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            if watermark in obj.get_data():
                evil_ids.append(objid)

        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content
|
||||||
|
|
9
pdfparanoia/utils.py
Normal file
9
pdfparanoia/utils.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia.utils
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
This module provides utility functions used both in pdfparanoia and that are
|
||||||
|
also useful for external consumption.
|
||||||
|
"""
|
||||||
|
|
29
setup.py
Normal file
29
setup.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from setuptools import setup
|
||||||
|
|
||||||
|
# Read the long description up front so the file handle is closed again.
with open("README.md", "r") as readme:
    long_description = readme.read()

setup(
    name="pdfparanoia",
    version="0.0.1",
    url="https://github.com/kanzure/pdfparanoia",
    license="BSD",
    author="Bryan Bishop",
    author_email="kanzure@gmail.com",
    description="pdf watermark remover library for academic papers",
    long_description=long_description,
    # Subpackages are not picked up automatically: without the plugins
    # package an installed pdfparanoia cannot import its own plugins.
    packages=["pdfparanoia", "pdfparanoia.plugins"],
    zip_safe=False,
    include_package_data=True,
    install_requires=["pdfminer>=0", "pdfquery>=0"],
    platforms="any",
    classifiers=[
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        #"Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.1",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
    ]
)
|
BIN
tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
Normal file
BIN
tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf
Normal file
Binary file not shown.
16
tests/test_aip.py
Normal file
16
tests/test_aip.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import pdfparanoia
|
||||||
|
|
||||||
|
class AmericanInstituteOfPhysicsTestCase(unittest.TestCase):
    """
    Checks the AIP plugin against a watermarked sample paper.
    """

    def test_aip(self):
        """
        Objects 4 and 10 of the sample are present before scrubbing and gone
        afterwards.
        """
        sample = "tests/samples/aip/a7132c0d62d7c00e92e8e0553f480556.pdf"

        # the with block closes the sample file even if reading fails
        with open(sample, "rb") as file_handler:
            content = file_handler.read()

        self.assertIn("\n4 0 obj\n", content)
        self.assertIn("\n10 0 obj\n", content)

        output = pdfparanoia.plugins.AmericanInstituteOfPhysics.scrub(content)
        self.assertNotIn("\n4 0 obj\n", output)
        self.assertNotIn("\n10 0 obj\n", output)
|
||||||
|
|
27
tests/test_eraser.py
Normal file
27
tests/test_eraser.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from pdfparanoia.eraser import remove_object_by_id
|
||||||
|
|
||||||
|
class EraserTestCase(unittest.TestCase):
    """
    Exercises remove_object_by_id on empty and single-object content.
    """

    def test_remove_object_by_id(self):
        """
        Unknown ids leave the content alone; a known id drops its object.
        """
        single_object = "1 0 obj\nthings\nendobj\nleftovers"

        # (content, object id to remove, expected output)
        cases = (
            ("", 1, ""),
            ("", 2, ""),
            ("", 100, ""),
            (single_object, 2, single_object),
            (single_object, 1, "leftovers"),
        )

        for content, objid, expected in cases:
            output = remove_object_by_id(content, objid)
            self.assertEqual(expected, output)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user