mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 23:15:52 +01:00
ieee watermark removal
This commit is contained in:
parent
0adec6c74e
commit
14f1439c76
@ -7,7 +7,7 @@ pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
|||||||
usage:
|
usage:
|
||||||
|
|
||||||
>>> import pdfparanoia
|
>>> import pdfparanoia
|
||||||
>>> pdf = pdfparanoia.scrub(open("nmat91417.pdf", "r"))
|
>>> pdf = pdfparanoia.scrub(open("nmat.pdf", "r"))
|
||||||
>>> file_handler = open("output.pdf", "w")
|
>>> file_handler = open("output.pdf", "w")
|
||||||
>>> file_handler.write(pdf)
|
>>> file_handler.write(pdf)
|
||||||
>>> file_handler.close()
|
>>> file_handler.close()
|
||||||
@ -17,8 +17,8 @@ usage:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
__title__ = "pdfparanoia"
|
__title__ = "pdfparanoia"
|
||||||
__version__ = "0.0.7"
|
__version__ = "0.0.8"
|
||||||
__build__ = 0x000007
|
__build__ = 0x000008
|
||||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||||
__license__ = "BSD"
|
__license__ = "BSD"
|
||||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||||
|
@ -18,7 +18,7 @@ def remove_object_by_id(content, objid):
|
|||||||
for line in lines:
|
for line in lines:
|
||||||
if not skip_mode:
|
if not skip_mode:
|
||||||
if last_line in ["endobj", None]:
|
if last_line in ["endobj", None]:
|
||||||
if line[-3:] == "obj":
|
if line[-3:] == "obj" or " obj<<" in line[0:50]:
|
||||||
if line.startswith(str(objid) + " "):
|
if line.startswith(str(objid) + " "):
|
||||||
skip_mode = True
|
skip_mode = True
|
||||||
last_line = line
|
last_line = line
|
||||||
|
@ -8,4 +8,5 @@ Scrubbing machines. Bubbles mandatory.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from .aip import *
|
from .aip import *
|
||||||
|
from .ieee import *
|
||||||
|
|
||||||
|
48
pdfparanoia/plugins/ieee.py
Normal file
48
pdfparanoia/plugins/ieee.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from copy import copy
|
||||||
|
|
||||||
|
from ..parser import parse_content
|
||||||
|
from ..eraser import remove_object_by_id
|
||||||
|
from ..plugin import Plugin
|
||||||
|
|
||||||
|
class IEEEXplore(Plugin):
    """
    IEEE Xplore
    ~~~~~~~~~~~~~~~

    Removes objects whose decoded stream data contains the text
    "Authorized licensed use limited to: " from a PDF.
    """

    @staticmethod
    def scrub(content):
        """
        Return ``content`` with every watermark object removed.

        :param content: raw PDF content, as accepted by ``parse_content``
            and ``remove_object_by_id``.
        :returns: the content after ``remove_object_by_id`` has been applied
            for each offending object id.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        logger = logging.getLogger(__name__)
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (only the first xref section is read)
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            obj = pdf.getobj(objid)

            # objects without an attribute dictionary cannot be inspected
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                data = obj.get_data()

                if "Authorized licensed use limited to: " in data:
                    evil_ids.append(objid)

        for objid in evil_ids:
            # Report through logging instead of printing to stdout, so that
            # library users do not get stray output mixed into their own.
            logger.debug("evil id: %s", objid)
            content = remove_object_by_id(content, objid)

        return content
|
||||||
|
|
BIN
tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
Normal file
BIN
tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf
Normal file
Binary file not shown.
20
tests/test_ieee.py
Normal file
20
tests/test_ieee.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import pdfparanoia
|
||||||
|
|
||||||
|
class IEEEXploreTestCase(unittest.TestCase):
    """Checks that the IEEE Xplore plugin strips watermark objects."""

    def test_ieee(self):
        """Scrub the sample IEEE PDF and verify the watermark objects are gone."""
        sample_path = "tests/samples/ieee/9984106e01b63d996f19f383b8d96f02.pdf"

        # Use a context manager so the sample file is always closed, even
        # when an assertion below fails.
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        # sanity check: the sample contains the expected object headers
        self.assertIn("\n4 0 obj", content)
        self.assertIn("\n7 0 obj", content)

        output = pdfparanoia.plugins.IEEEXplore.scrub(content)

        # these object ids must no longer appear in the scrubbed output
        for objid in (19, 37, 43, 53, 64, 73):
            self.assertNotIn(
                "\n%d 0 obj" % objid,
                output,
                "object %d was not removed" % objid,
            )
|
||||||
|
|
Loading…
Reference in New Issue
Block a user