diff --git a/pdfparanoia/__init__.py b/pdfparanoia/__init__.py
index c004fd3..9fda1b8 100644
--- a/pdfparanoia/__init__.py
+++ b/pdfparanoia/__init__.py
@@ -7,7 +7,7 @@ pdfparanoia is a pdf watermark remover library for academic papers. Basic
 usage:
 
     >>> import pdfparanoia
-    >>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r"))
+    >>> pdf = pdfparanoia.scrub(open("nmat.pdf", "r"))
     >>> file_handler = open("output.pdf", "w")
     >>> file_handler.write(pdf)
     >>> file_handler.close()
@@ -17,8 +17,8 @@ usage:
 """
 
 __title__ = "pdfparanoia"
-__version__ = "0.0.7"
-__build__ = 0x000007
+__version__ = "0.0.8"
+__build__ = 0x000008
 __author__ = "Bryan Bishop "
 __license__ = "BSD"
 __copyright__ = "Copyright 2013 Bryan Bishop"
diff --git a/pdfparanoia/eraser.py b/pdfparanoia/eraser.py
index dd3de24..fae200e 100644
--- a/pdfparanoia/eraser.py
+++ b/pdfparanoia/eraser.py
@@ -18,7 +18,7 @@ def remove_object_by_id(content, objid):
     for line in lines:
         if not skip_mode:
             if last_line in ["endobj", None]:
-                if line[-3:] == "obj":
+                if line[-3:] == "obj" or " obj<<" in line[0:50]:
                     if line.startswith(str(objid) + " "):
                         skip_mode = True
                         last_line = line
diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py
index b6c05b3..4b4686b 100644
--- a/pdfparanoia/plugins/__init__.py
+++ b/pdfparanoia/plugins/__init__.py
@@ -8,4 +8,5 @@ Scrubbing machines. Bubbles mandatory.
 """
 
 from .aip import *
+from .ieee import *
 
diff --git a/pdfparanoia/plugins/ieee.py b/pdfparanoia/plugins/ieee.py
new file mode 100644
index 0000000..1587445
--- /dev/null
+++ b/pdfparanoia/plugins/ieee.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+
+from copy import copy
+
+from ..parser import parse_content
+from ..eraser import remove_object_by_id
+from ..plugin import Plugin
+
+class IEEEXplore(Plugin):
+    """
+    IEEE Xplore
+    ~~~~~~~~~~~~~~~
+
+    Deletes every FlateDecode object whose data (as returned by get_data())
+    contains the text "Authorized licensed use limited to: ".
+    """
+
+    @staticmethod
+    def scrub(content):
+        """
+        Returns content with the watermark objects removed.
+
+        content is the pdf file data. Only the object ids listed in the
+        first xref section are examined.
+        """
+        evil_ids = []
+
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # get a list of all object ids
+        xrefs = pdf._parser.read_xref()
+        xref = xrefs[0]
+        objids = xref.get_objids()
+
+        # check each object in the pdf
+        for objid in objids:
+            # get an object by id
+            obj = pdf.getobj(objid)
+
+            # skip objects that have no attrs mapping
+            if not hasattr(obj, "attrs"):
+                continue
+
+            # watermarks tend to be in FlateDecode elements
+            # ("in" rather than has_key, which python 3 dicts do not have)
+            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
+                data = copy(obj.get_data())
+
+                if "Authorized licensed use limited to: " in data:
+                    evil_ids.append(objid)
+
+        for objid in evil_ids:
+            # call form of print works on both python 2 and python 3
+            print("evil id: " + str(objid))
+            content = remove_object_by_id(content, objid)
+
+        return content
+
diff --git a/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf b/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
new file mode 100644
index 0000000..77169a4
Binary files /dev/null and b/tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf differ
diff --git a/tests/test_ieee.py b/tests/test_ieee.py
new file mode 100644
index 0000000..0e565c5
--- /dev/null
+++ b/tests/test_ieee.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import pdfparanoia
+
+class IEEEXploreTestCase(unittest.TestCase):
+    def test_ieee(self):
+        # "with" closes the sample file as soon as it has been read
+        with open("tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf", "rb") as file_handler:
+            content = file_handler.read()
+
+        self.assertIn("\n4 0 obj", content)
+        self.assertIn("\n7 0 obj", content)
+
+        output = pdfparanoia.plugins.IEEEXplore.scrub(content)
+        self.assertNotIn("\n19 0 obj", output)
+        self.assertNotIn("\n37 0 obj", output)
+        self.assertNotIn("\n43 0 obj", output)
+        self.assertNotIn("\n53 0 obj", output)
+        self.assertNotIn("\n64 0 obj", output)
+        self.assertNotIn("\n73 0 obj", output)
+