mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-05-29 07:08:03 +02:00
Compare commits
26 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
100d202221 | ||
|
c1af289767 | ||
|
26dfe093fe | ||
|
681af5c175 | ||
|
906150e033 | ||
|
585a0ac3a4 | ||
|
5669e4e289 | ||
|
c594ff41d6 | ||
|
1a01757f44 | ||
|
5cc682e2c5 | ||
|
e95374ec04 | ||
|
95a249d8ab | ||
|
380bc289b3 | ||
|
713776af67 | ||
|
e9e0ea4467 | ||
|
61e67d2c4a | ||
|
28bf8f5825 | ||
|
1ff513389f | ||
|
cc7d14d173 | ||
|
71aaf23285 | ||
|
528eae7e46 | ||
|
59a71a7cd3 | ||
|
c3e590f22f | ||
|
f3e4b74b69 | ||
|
6030778089 | ||
|
1070605316 |
6
Makefile
6
Makefile
|
@ -1,7 +1,7 @@
|
|||
SHELL := /bin/bash
|
||||
|
||||
test:
|
||||
nosetests-2.7 -s --verbosity=2
|
||||
nosetests -s --verbosity=2
|
||||
|
||||
clean:
|
||||
rm -fr build dist
|
||||
|
@ -10,8 +10,8 @@ clean:
|
|||
find . -name *.swp -exec rm {} \;
|
||||
|
||||
install:
|
||||
python2.7 setup.py install
|
||||
python3 setup.py install
|
||||
|
||||
upload: clean
|
||||
python2.7 setup.py sdist upload
|
||||
python3 setup.py sdist upload
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ publishers include private information like institution names, personal names,
|
|||
ip addresses, timestamps and other identifying information in watermarks on
|
||||
each page.
|
||||
|
||||
pdfparania это библиотека для удаления водяных знаков из PDF файлов научных
|
||||
pdfparanoia это библиотека для удаления водяных знаков из PDF файлов научных
|
||||
статей. Некоторые издатели включают личную информацию, такую как названия
|
||||
институтов, имена, IP-адреса, время и дату и другую информацию в водяные знаки
|
||||
содержащиеся на каждой странице.
|
||||
|
@ -26,6 +26,7 @@ sudo python setup.py install
|
|||
|
||||
pdfparanoia is written for python2.7+ or python 3.
|
||||
You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia.
|
||||
For python versions prior to Python 3, use "pdfminer" from the Python Package Index (http://pypi.python.org). For recent versions of Python, use pdfminer3k instead.
|
||||
|
||||
## Usage
|
||||
|
||||
|
|
|
@ -12,7 +12,11 @@ if __name__ == "__main__":
|
|||
import sys
|
||||
import pdfparanoia
|
||||
import argparse
|
||||
|
||||
try:
|
||||
from StringIO import StringIO
|
||||
except ImportError:
|
||||
from io import StringIO, BytesIO
|
||||
|
||||
ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.")
|
||||
ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'),
|
||||
|
|
|
@ -1,2 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia - pdf watermark remover library for academic papers
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
||||
usage:
|
||||
|
||||
>>> import pdfparanoia
|
||||
>>> pdf = pdfparanoia.scrub(open("nmat.pdf", "rb"))
|
||||
>>> file_handler = open("output.pdf", "wb")
|
||||
>>> file_handler.write(pdf)
|
||||
>>> file_handler.close()
|
||||
|
||||
:copyright: (c) 2013 by Bryan Bishop.
|
||||
:license: BSD.
|
||||
"""
|
||||
|
||||
__title__ = "pdfparanoia"
|
||||
__version__ = "0.0.17"
|
||||
__build__ = 0x000017
|
||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||
__license__ = "BSD"
|
||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
||||
from . import utils
|
||||
from .core import scrub
|
||||
from .parser import deflate
|
||||
|
||||
|
|
|
@ -7,15 +7,16 @@ Deals with the existential nature of parsing pdfs.
|
|||
|
||||
"""
|
||||
|
||||
try:
|
||||
from StringIO import StringIO
|
||||
except ImportError: # py3k
|
||||
from io import StringIO, BytesIO
|
||||
|
||||
# Maybe one day pdfquery will be able to save pdf.
|
||||
# from pdfquery import PDFQuery
|
||||
|
||||
from pdfminer.pdfparser import (
|
||||
PDFParser,
|
||||
PDFDocument,
|
||||
)
|
||||
import pdfminer.pdfparser
|
||||
import pdfminer.pdfdocument
|
||||
|
||||
from .eraser import replace_object_with
|
||||
|
||||
|
@ -27,10 +28,8 @@ def parse_pdf(handler):
|
|||
handler.seek(0)
|
||||
|
||||
# setup for parsing
|
||||
parser = PDFParser(handler)
|
||||
doc = PDFDocument()
|
||||
parser.set_document(doc)
|
||||
doc.set_parser(parser)
|
||||
parser = pdfminer.pdfparser.PDFParser(handler)
|
||||
doc = pdfminer.pdfdocument.PDFDocument(parser)
|
||||
|
||||
# actual parsing
|
||||
doc.initialize()
|
||||
|
@ -58,8 +57,7 @@ def deflate(content):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# store new replacements
|
||||
|
|
|
@ -11,4 +11,4 @@ from .aip import *
|
|||
from .ieee import *
|
||||
from .jstor import *
|
||||
from .rsc import *
|
||||
|
||||
from .sciencemagazine import *
|
||||
|
|
|
@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
|
@ -36,7 +35,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||
|
||||
if hasattr(obj, "attrs"):
|
||||
# watermarks tend to be in FlateDecode elements
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
length = obj.attrs["Length"]
|
||||
|
||||
# the watermark is never very long
|
||||
|
@ -45,7 +44,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||
data = copy(obj.get_data())
|
||||
|
||||
phrase="Redistribution subject to AIP license or copyright"
|
||||
if phrase in data:
|
||||
if phrase in str(data):
|
||||
if verbose >= 2:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
|
||||
elif verbose >= 1:
|
||||
|
|
|
@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
|
@ -33,13 +32,13 @@ class IEEEXplore(Plugin):
|
|||
|
||||
if hasattr(obj, "attrs"):
|
||||
# watermarks tend to be in FlateDecode elements
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
#length = obj.attrs["Length"]
|
||||
#rawdata = copy(obj.rawdata)
|
||||
data = copy(obj.get_data())
|
||||
|
||||
phrase= "Authorized licensed use limited to: "
|
||||
if phrase in data:
|
||||
if phrase in str(data):
|
||||
if verbose >= 2:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
||||
elif verbose >= 1:
|
||||
|
|
|
@ -10,6 +10,8 @@ from ..eraser import (
|
|||
)
|
||||
from ..plugin import Plugin
|
||||
|
||||
from pdfminer.pdftypes import PDFObjectNotFound
|
||||
|
||||
class JSTOR(Plugin):
|
||||
"""
|
||||
JSTOR
|
||||
|
@ -44,13 +46,13 @@ class JSTOR(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
for objid in objids:
|
||||
# get an object by id
|
||||
try:
|
||||
obj = pdf.getobj(objid)
|
||||
|
||||
if hasattr(obj, "attrs"):
|
||||
|
@ -93,6 +95,8 @@ class JSTOR(Plugin):
|
|||
replacements.append([objid, better_content])
|
||||
|
||||
page_id += 1
|
||||
except PDFObjectNotFound as e:
|
||||
print >>sys.stderr, 'Missing object: %r' % e
|
||||
|
||||
if verbose >= 1 and replacements:
|
||||
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
||||
|
|
|
@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
|
|
49
pdfparanoia/plugins/sciencemagazine.py
Normal file
49
pdfparanoia/plugins/sciencemagazine.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
import sys
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class ScienceMagazine(Plugin):
    """
    Science Magazine
    ~~~~~~~~~~~~~~~

    Strips the advertisement object out of academic papers. :(
    """

    # TODO: better confirmation that the paper is from sciencemag. Look for
    # "oascentral" in one of the URIs, since the ads are all hyperlinked to
    # that server.

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Return ``content`` with the advertisement object removed.

        An object is treated as an ad when its attributes report a Width of
        432 and a Height of 230. If more than one object matches, an
        Exception is raised instead of removing anything. ``verbose`` is
        accepted but not consulted by this plugin.
        """
        # parse the pdf into a pdfminer document
        document = parse_content(content)

        # the first xref table supplies the object ids to inspect
        candidate_ids = document.xrefs[0].get_objids()

        ad_ids = []
        for objid in candidate_ids:
            obj = document.getobj(objid)

            # objects without attributes cannot describe an image size
            if not hasattr(obj, "attrs"):
                continue

            attrs = obj.attrs
            if "Width" not in attrs or str(attrs["Width"]) != "432":
                continue
            if "Height" not in attrs or str(attrs["Height"]) != "230":
                continue

            ad_ids.append(objid)

        # several matches suggest the size heuristic hit real content
        if len(ad_ids) > 1:
            raise Exception("too many ads detected on the page, please double check?")

        for objid in ad_ids:
            content = remove_object_by_id(content, objid)

        return content
|
|
@ -1 +1 @@
|
|||
pdfminer>=0
|
||||
pdfminer>=20131113
|
||||
|
|
20
setup.py
20
setup.py
|
@ -1,12 +1,26 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from setuptools import setup
|
||||
import os
|
||||
import platform
|
||||
|
||||
import pdfparanoia
|
||||
|
||||
long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).read()
|
||||
|
||||
# pdfminer isn't cross-version compatible but a py3k port is in PyPI
|
||||
if platform.python_version() >= "3.0.0":
|
||||
dependencies = ["pdfminer3k>=1.3.0"]
|
||||
else:
|
||||
dependencies = ["pdfminer>=20131113"]
|
||||
|
||||
packages = [
|
||||
"pdfparanoia",
|
||||
"pdfparanoia.plugins",
|
||||
]
|
||||
|
||||
setup(
|
||||
name="pdfparanoia",
|
||||
version="0.0.14",
|
||||
version=pdfparanoia.__version__,
|
||||
url="https://github.com/kanzure/pdfparanoia",
|
||||
license="BSD",
|
||||
author="Bryan Bishop",
|
||||
|
@ -15,7 +29,9 @@ setup(
|
|||
maintainer_email="kanzure@gmail.com",
|
||||
description="pdf watermark remover library for academic papers",
|
||||
long_description=long_description,
|
||||
install_requires=["pdfminer>=0"],
|
||||
long_description_content_type="text/markdown",
|
||||
install_requires=dependencies,
|
||||
packages=packages,
|
||||
scripts=["bin/pdfparanoia"],
|
||||
platforms="any",
|
||||
zip_safe=False,
|
||||
|
|
|
@ -8,13 +8,13 @@ class JSTORTestCase(unittest.TestCase):
|
|||
file_handler = open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb")
|
||||
content = file_handler.read()
|
||||
file_handler.close()
|
||||
self.assertIn("\n18 0 obj \n", content)
|
||||
self.assertIn(b"\n18 0 obj \n", content)
|
||||
|
||||
# this section will later be manipulated
|
||||
self.assertIn("\n19 0 obj \n", content)
|
||||
self.assertIn(b"\n19 0 obj \n", content)
|
||||
|
||||
output = pdfparanoia.plugins.JSTOR.scrub(content)
|
||||
|
||||
# FlateDecode should be replaced with a decompressed section
|
||||
self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output)
|
||||
self.assertIn(b"\n19 0 obj\n<</Length 2862>>stream", output)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user