diff --git a/README.md b/README.md index d9f7334..6ca769d 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,11 @@ cat input.pdf | pdfparanoia > output.pdf * AIP * IEEE * JSTOR +* SPIE (sort of) ## Changelog +* 0.0.12 - SPIE * 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot. * 0.0.10 - JSTOR * 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage. diff --git a/pdfparanoia/__init__.py b/pdfparanoia/__init__.py index 19d0f0e..8194a4a 100644 --- a/pdfparanoia/__init__.py +++ b/pdfparanoia/__init__.py @@ -17,8 +17,8 @@ usage: """ __title__ = "pdfparanoia" -__version__ = "0.0.11" -__build__ = 0x000011 +__version__ = "0.0.12" +__build__ = 0x000012 __author__ = "Bryan Bishop " __license__ = "BSD" __copyright__ = "Copyright 2013 Bryan Bishop" diff --git a/pdfparanoia/eraser.py b/pdfparanoia/eraser.py index 3e35c1f..8b32704 100644 --- a/pdfparanoia/eraser.py +++ b/pdfparanoia/eraser.py @@ -19,6 +19,9 @@ def manipulate_pdf(content, objid, callback, *args): last_line = None skip_mode = False for line in lines: + if line == "": + outlines.append("") + continue if not skip_mode: if last_line in ["endobj", "endobj ", None]: if line[-3:] == "obj" or line[-4:] == "obj " or " obj <<" in line[0:50] or " obj<<" in line[0:50]: diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py index 10179eb..93a425b 100644 --- a/pdfparanoia/plugins/__init__.py +++ b/pdfparanoia/plugins/__init__.py @@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory. 
# -*- coding: utf-8 -*-
# pdfparanoia/plugins/spie.py

from copy import copy

from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin


class SPIE(Plugin):
    """
    Society of Photo-Optical Instrumentation Engineers
    ~~~~~~~~~~~~~~~

    These watermarks are shown on each page, but are only defined in one place.
    Also, there seems to be some interference from some of the other
    pdfparanoia plugins causing the deletion of images in the document.
    Side-effects need to be better accounted for.

    """

    @staticmethod
    def scrub(content):
        """
        Look for SPIE watermark objects in a pdf.

        Every object listed in the first xref table is inspected; objects
        whose "Filter" attribute is "/FlateDecode" and whose decoded data
        contains the text "Downloaded From:" are treated as watermarks.

        :param content: raw pdf data, as accepted by ``parse_content``
        :return: ``content``, currently unmodified (removal of the detected
            objects is disabled, see the note at the end of this method)
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids from the first xref table
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only objects carrying an attribute dictionary are candidates
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements; use the ``in``
            # operator rather than dict.has_key(), which exists only on
            # python 2
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                data = copy(obj.get_data())

                if "Downloaded From:" in data:
                    evil_ids.append(objid)

        # NOTE: removal is deliberately disabled. For some reason SPIE pdfs
        # are broken by it (images are randomly removed), so the ids gathered
        # in evil_ids are not acted upon yet. The intended step, per id, is:
        #     content = remove_object_by_id(content, objid)

        return content
# -*- coding: utf-8 -*-
# tests/test_spie.py

import unittest
import pdfparanoia


class SPIETestCase(unittest.TestCase):
    """Runs the SPIE plugin against a sample SPIE pdf."""

    def test_spie(self):
        """Scrub the sample pdf and check which object markers are present."""
        # read the sample inside a ``with`` block so the file handle is
        # always closed, even if reading fails
        with open("tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf", "rb") as file_handler:
            content = file_handler.read()

        # precondition: the sample contains object 46
        self.assertIn("\n46 0 obj", content)

        output = pdfparanoia.plugins.SPIE.scrub(content)

        # NOTE(review): the precondition looks at object 46 while this check
        # looks at object 55 -- confirm which id actually holds the watermark.
        self.assertNotIn("\n55 0 obj", output)