1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-12-04 23:15:52 +01:00

SPIE watermark removal

This is slightly broken because the SPIE plugin removes more than just
watermarks. For some reason it seems to also remove images and large
blocks of text from the paper. However, the object that is being removed
is tiny. In the unit testing sample, the removed object is pdf stream
55.

For now, SPIE is partially disabled until this is fixed. The problem
does not originate from the other plugins.

fixes #20
This commit is contained in:
Bryan Bishop 2013-02-11 23:52:59 -06:00
parent 9d7fd1dbb6
commit caed396870
7 changed files with 74 additions and 2 deletions

View File

@ -45,9 +45,11 @@ cat input.pdf | pdfparanoia > output.pdf
* AIP * AIP
* IEEE * IEEE
* JSTOR * JSTOR
* SPIE (sort of)
## Changelog ## Changelog
* 0.0.12 - SPIE
* 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot. * 0.0.11 - pdfparanoia command-line interface. Use it by either piping in pdf data, or specifying a path to a pdf in the first argv slot.
* 0.0.10 - JSTOR * 0.0.10 - JSTOR
* 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage. * 0.0.9 - AIP: better checks for false-positives; IEEE: remove stdout garbage.

View File

@ -17,8 +17,8 @@ usage:
""" """
__title__ = "pdfparanoia" __title__ = "pdfparanoia"
__version__ = "0.0.11" __version__ = "0.0.12"
__build__ = 0x000011 __build__ = 0x000012
__author__ = "Bryan Bishop <kanzure@gmail.com>" __author__ = "Bryan Bishop <kanzure@gmail.com>"
__license__ = "BSD" __license__ = "BSD"
__copyright__ = "Copyright 2013 Bryan Bishop" __copyright__ = "Copyright 2013 Bryan Bishop"

View File

@ -19,6 +19,9 @@ def manipulate_pdf(content, objid, callback, *args):
last_line = None last_line = None
skip_mode = False skip_mode = False
for line in lines: for line in lines:
if line == "":
outlines.append("")
continue
if not skip_mode: if not skip_mode:
if last_line in ["endobj", "endobj ", None]: if last_line in ["endobj", "endobj ", None]:
if line[-3:] == "obj" or line[-4:] == "obj " or " obj <<" in line[0:50] or " obj<<" in line[0:50]: if line[-3:] == "obj" or line[-4:] == "obj " or " obj <<" in line[0:50] or " obj<<" in line[0:50]:

View File

@ -10,4 +10,5 @@ Scrubbing machines. Bubbles mandatory.
from .aip import * from .aip import *
from .ieee import * from .ieee import *
from .jstor import * from .jstor import *
from .spie import *

View File

@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
from copy import copy
from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin
class SPIE(Plugin):
    """
    Society of Photo-Optical Instrumentation Engineers
    ~~~~~~~~~~~~~~~

    These watermarks are shown on each page, but are only defined in one place.

    Also, there seems to be some interference from some of the other
    pdfparanoia plugins causing the deletion of images in the document.
    Side-effects need to be better accounted for.
    """

    @staticmethod
    def scrub(content):
        """
        Look for SPIE watermark objects in a pdf.

        Every object listed in the first xref table is inspected; an object
        is treated as a watermark when it is a FlateDecode stream whose data
        contains the text "Downloaded From:".

        The removal step is currently disabled (see the comment in the final
        loop), so the content is returned unchanged.

        :param content: raw pdf data
        :return: the pdf data, currently identical to the input
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only objects carrying an attribute dictionary can be inspected
            if not hasattr(obj, "attrs"):
                continue

            # watermarks tend to be in FlateDecode elements
            # ("in" instead of has_key(), which no longer exists in python 3)
            if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
                data = copy(obj.get_data())

                if "Downloaded From:" in data:
                    evil_ids.append(objid)

        for objid in evil_ids:
            # for some reason SPIE pdfs are broken by this, images are randomly removed
            #content = remove_object_by_id(content, objid)
            continue

        return content

14
tests/test_spie.py Normal file
View File

@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
import unittest
import pdfparanoia
class SPIETestCase(unittest.TestCase):
    """Runs the SPIE plugin against a sample SPIE pdf."""

    def test_spie(self):
        """
        Scrub the sample pdf and check that object 55 is gone from the output.
        """
        sample_path = "tests/samples/spie/266c86e6f47e39415584450f5a3af4d0.pdf"

        # the context manager closes the sample file even if read() raises
        with open(sample_path, "rb") as file_handler:
            content = file_handler.read()

        # sanity check on the sample before scrubbing
        # NOTE(review): this checks object 46 while the assertion below
        # targets object 55 -- confirm that 46 is the intended id here.
        self.assertIn("\n46 0 obj", content)

        output = pdfparanoia.plugins.SPIE.scrub(content)
        self.assertNotIn("\n55 0 obj", output)