1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-12-04 23:15:52 +01:00

ieee watermark removal

This commit is contained in:
Bryan Bishop 2013-02-05 04:49:56 -06:00
parent 0adec6c74e
commit 14f1439c76
6 changed files with 73 additions and 4 deletions

View File

@ -7,7 +7,7 @@ pdfparanoia is a pdf watermark remover library for academic papers. Basic
usage: usage:
>>> import pdfparanoia >>> import pdfparanoia
>>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r")) >>> pdf = pdfparanoia.scrub(open("nmat.pdf", "r"))
>>> file_handler = open("output.pdf", "w") >>> file_handler = open("output.pdf", "w")
>>> file_handler.write(pdf) >>> file_handler.write(pdf)
>>> file_handler.close() >>> file_handler.close()
@ -17,8 +17,8 @@ usage:
""" """
__title__ = "pdfparanoia" __title__ = "pdfparanoia"
__version__ = "0.0.7" __version__ = "0.0.8"
__build__ = 0x000007 __build__ = 0x000008
__author__ = "Bryan Bishop <kanzure@gmail.com>" __author__ = "Bryan Bishop <kanzure@gmail.com>"
__license__ = "BSD" __license__ = "BSD"
__copyright__ = "Copyright 2013 Bryan Bishop" __copyright__ = "Copyright 2013 Bryan Bishop"

View File

@ -18,7 +18,7 @@ def remove_object_by_id(content, objid):
for line in lines: for line in lines:
if not skip_mode: if not skip_mode:
if last_line in ["endobj", None]: if last_line in ["endobj", None]:
if line[-3:] == "obj": if line[-3:] == "obj" or " obj<<" in line[0:50]:
if line.startswith(str(objid) + " "): if line.startswith(str(objid) + " "):
skip_mode = True skip_mode = True
last_line = line last_line = line

View File

@ -8,4 +8,5 @@ Scrubbing machines. Bubbles mandatory.
""" """
from .aip import * from .aip import *
from .ieee import *

View File

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from copy import copy
from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin
class IEEEXplore(Plugin):
    """
    IEEE Xplore
    ~~~~~~~~~~~~~~~

    Removes the PDF objects that carry the "Authorized licensed use limited
    to: " stamp.
    """

    @staticmethod
    def scrub(content):
        """
        Return ``content`` with every stamped object removed.

        :param content: raw contents of the pdf file
        :return: the contents with each offending object stripped out by
            ``remove_object_by_id``
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        # NOTE(review): only the first xref section is inspected; objects
        # listed solely in later sections are not checked.
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # objects without an attribute dictionary are of no interest
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                # the data is only searched, never modified, so no copy is needed
                data = obj.get_data()

                if "Authorized licensed use limited to: " in data:
                    evil_ids.append(objid)

        for objid in evil_ids:
            # single-argument call form prints the same text on python 2 and 3
            print("evil id: " + str(objid))
            content = remove_object_by_id(content, objid)

        return content

20
tests/test_ieee.py Normal file
View File

@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
import unittest
import pdfparanoia
class IEEEXploreTestCase(unittest.TestCase):
    """Runs the IEEEXplore scrubber against a sample pdf."""

    def test_ieee(self):
        """Scrubbing the sample must drop the listed object ids."""
        sample_path = "tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf"

        # use a context manager so the handle is closed even if read() fails
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        # sanity check: the sample really contains the objects we expect
        self.assertIn("\n4 0 obj", content)
        self.assertIn("\n7 0 obj", content)

        output = pdfparanoia.plugins.IEEEXplore.scrub(content)

        # these objects are expected to be removed by the scrubber
        self.assertNotIn("\n19 0 obj", output)
        self.assertNotIn("\n37 0 obj", output)
        self.assertNotIn("\n43 0 obj", output)
        self.assertNotIn("\n53 0 obj", output)
        self.assertNotIn("\n64 0 obj", output)
        self.assertNotIn("\n73 0 obj", output)