mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 15:05:52 +01:00
ieee watermark removal
This commit is contained in:
parent
0adec6c74e
commit
14f1439c76
@ -7,7 +7,7 @@ pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
||||
usage:
|
||||
|
||||
>>> import pdfparanoia
|
||||
>>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r"))
|
||||
>>> pdf = pdfparanoia.scrub(open("nmat.pdf", "rb"))
|
||||
>>> file_handler = open("output.pdf", "wb")
|
||||
>>> file_handler.write(pdf)
|
||||
>>> file_handler.close()
|
||||
@ -17,8 +17,8 @@ usage:
|
||||
"""
|
||||
|
||||
__title__ = "pdfparanoia"
|
||||
__version__ = "0.0.7"
|
||||
__build__ = 0x000007
|
||||
__version__ = "0.0.8"
|
||||
__build__ = 0x000008
|
||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||
__license__ = "BSD"
|
||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
@ -18,7 +18,7 @@ def remove_object_by_id(content, objid):
|
||||
for line in lines:
|
||||
if not skip_mode:
|
||||
if last_line in ["endobj", None]:
|
||||
if line[-3:] == "obj":
|
||||
if line[-3:] == "obj" or " obj<<" in line[0:50]:
|
||||
if line.startswith(str(objid) + " "):
|
||||
skip_mode = True
|
||||
last_line = line
|
||||
|
@ -8,4 +8,5 @@ Scrubbing machines. Bubbles mandatory.
|
||||
"""
|
||||
|
||||
from .aip import *
|
||||
from .ieee import *
|
||||
|
||||
|
48
pdfparanoia/plugins/ieee.py
Normal file
48
pdfparanoia/plugins/ieee.py
Normal file
@ -0,0 +1,48 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class IEEEXplore(Plugin):
    """
    IEEE Xplore
    ~~~~~~~~~~~~~~~

    Scrubs the IEEE Xplore license watermark from a pdf by erasing every
    object whose stream data contains the text
    "Authorized licensed use limited to: ".
    """

    @staticmethod
    def scrub(content):
        """
        Remove the watermark objects from ``content``.

        :param content: raw contents of the pdf file
        :return: the pdf contents after each matching object has been
            passed through ``remove_object_by_id``
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        # NOTE(review): only the first entry returned by read_xref() is
        # consulted; confirm that is enough for pdfs with several xref
        # sections.
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            # objects without attrs cannot declare a Filter, skip them
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            if "Filter" not in obj.attrs:
                continue
            if str(obj.attrs["Filter"]) != "/FlateDecode":
                continue

            data = obj.get_data()

            if "Authorized licensed use limited to: " in data:
                evil_ids.append(objid)

        # erase the offending objects one at a time; each call works on the
        # output of the previous one
        for objid in evil_ids:
            # parenthesised single-argument form prints the same text under
            # python 2 and is also valid python 3 syntax
            print("evil id: " + str(objid))
            content = remove_object_by_id(content, objid)

        return content
|
||||
|
BIN
tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
Normal file
BIN
tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
Normal file
Binary file not shown.
20
tests/test_ieee.py
Normal file
20
tests/test_ieee.py
Normal file
@ -0,0 +1,20 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import unittest
|
||||
import pdfparanoia
|
||||
|
||||
class IEEEXploreTestCase(unittest.TestCase):
    """Exercises the IEEE Xplore plugin against a bundled sample pdf."""

    def test_ieee(self):
        """Scrub the sample pdf and check the listed objects are gone."""
        # binary mode: the pdf must be read byte-for-byte; the with block
        # closes the handle even when read() raises
        with open("tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf", "rb") as file_handler:
            content = file_handler.read()

        # sanity check that the sample was loaded
        self.assertIn("\n4 0 obj", content)
        self.assertIn("\n7 0 obj", content)

        output = pdfparanoia.plugins.IEEEXplore.scrub(content)

        # NOTE(review): the ids checked below differ from the ones checked
        # before scrubbing, so these pass trivially if the sample never
        # contained them -- confirm against the sample file.
        self.assertNotIn("\n19 0 obj", output)
        self.assertNotIn("\n37 0 obj", output)
        self.assertNotIn("\n43 0 obj", output)
        self.assertNotIn("\n53 0 obj", output)
        self.assertNotIn("\n64 0 obj", output)
        self.assertNotIn("\n73 0 obj", output)
|
||||
|
Loading…
Reference in New Issue
Block a user