mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 15:05:52 +01:00
SPIE watermark removal
This is slightly broken because the SPIE plugin removes more than just watermarks. For some reason it seems to also remove images and large blocks of text from the paper. However, the object that is being removed is tiny. In the unit testing sample, the removed object is pdf stream 55. For now, SPIE is partially disabled until this is fixed. The problem does not originate from the other plugins. fixes #20
This commit is contained in:
parent
9d7fd1dbb6
commit
caed396870
@ -45,9 +45,11 @@ cat input.pdf | pdfparanoia > output.pdf
|
||||
* AIP
|
||||
* IEEE
|
||||
* JSTOR
|
||||
* SPIE (sort of)
|
||||
|
||||
## Changelog
|
||||
|
||||
* 0.0.12 - SPIE
|
||||
* 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
|
||||
* 0.0.10 - JSTOR
|
||||
* 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage.
|
||||
|
@ -17,8 +17,8 @@ usage:
|
||||
"""
|
||||
|
||||
__title__ = "pdfparanoia"
|
||||
__version__ = "0.0.11"
|
||||
__build__ = 0x000011
|
||||
__version__ = "0.0.12"
|
||||
__build__ = 0x000012
|
||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||
__license__ = "BSD"
|
||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
@ -19,6 +19,9 @@ def manipulate_pdf(content, objid, callback, *args):
|
||||
last_line = None
|
||||
skip_mode = False
|
||||
for line in lines:
|
||||
if line == "":
|
||||
outlines.append("")
|
||||
continue
|
||||
if not skip_mode:
|
||||
if last_line in ["endobj", "endobj ", None]:
|
||||
if line[-3:] == "obj" or line[-4:] == "obj " or " obj <<" in line[0:50] or " obj<<" in line[0:50]:
|
||||
|
@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
|
||||
from .aip import *
|
||||
from .ieee import *
|
||||
from .jstor import *
|
||||
from .spie import *
|
||||
|
||||
|
52
pdfparanoia/plugins/spie.py
Normal file
52
pdfparanoia/plugins/spie.py
Normal file
@ -0,0 +1,52 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class SPIE(Plugin):
    """
    Society of Photo-Optical Instrumentation Engineers
    ~~~~~~~~~~~~~~~

    These watermarks are shown on each page, but are only defined in one place.

    Removal is currently disabled: erasing the matched object from SPIE pdfs
    also removes images and large blocks of text from the paper. The cause
    has not been identified yet (it does not appear to originate from the
    other pdfparanoia plugins). Side-effects need to be better accounted for.

    """

    @staticmethod
    def scrub(content):
        """
        Look for SPIE watermark objects in the pdf.

        :param content: raw pdf data
        :return: the pdf data; returned unchanged for now because object
                 removal is disabled (see the class docstring)
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (only the first xref section is used)
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only stream-like objects carry an attribute dictionary
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            # ("in" instead of dict.has_key, which python 3 no longer has)
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                data = obj.get_data()

                if "Downloaded From:" in data:
                    evil_ids.append(objid)

        # TODO: for some reason SPIE pdfs are broken by removing these objects
        # (images are randomly removed), so the matches are collected but
        # nothing is erased. Once that is fixed, re-enable:
        #
        #     for objid in evil_ids:
        #         content = remove_object_by_id(content, objid)

        return content
|
||||
|
BIN
tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf
Normal file
BIN
tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf
Normal file
Binary file not shown.
14
tests/test_spie.py
Normal file
14
tests/test_spie.py
Normal file
@ -0,0 +1,14 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import unittest
|
||||
import pdfparanoia
|
||||
|
||||
class SPIETestCase(unittest.TestCase):
    """Checks SPIE watermark removal against a sample pdf."""

    def test_spie(self):
        """The watermark object (55) is present before scrubbing and absent after."""
        sample_path = "tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf"

        # "with" guarantees the sample file is closed even if reading fails
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        # precondition: check for the same object that must be gone afterwards,
        # otherwise the final assertion could pass vacuously
        self.assertIn("\n55 0 obj", content)

        output = pdfparanoia.plugins.SPIE.scrub(content)

        # NOTE(review): this fails for as long as object removal stays
        # disabled inside SPIE.scrub -- confirm whether that is intended
        self.assertNotIn("\n55 0 obj", output)
|
||||
|
Loading…
Reference in New Issue
Block a user