Compare commits
26 Commits
Author | SHA1 | Date |
---|---|---|
Bryan Bishop | 100d202221 | |
Bryan Bishop | c1af289767 | |
Bryan Bishop | 26dfe093fe | |
Bryan Bishop | 681af5c175 | |
Bryan Bishop | 906150e033 | |
Bryan Bishop | 585a0ac3a4 | |
Ashwini Purohit | 5669e4e289 | |
Bryan Bishop | c594ff41d6 | |
Bryan Bishop | 1a01757f44 | |
Bryan Bishop | 5cc682e2c5 | |
vi | e95374ec04 | |
vi | 95a249d8ab | |
vi | 380bc289b3 | |
Bryan Bishop | 713776af67 | |
Bryan Bishop | e9e0ea4467 | |
Bryan Bishop | 61e67d2c4a | |
Bryan Bishop | 28bf8f5825 | |
Bryan Bishop | 1ff513389f | |
Bryan Bishop | cc7d14d173 | |
Bryan Bishop | 71aaf23285 | |
Bryan Bishop | 528eae7e46 | |
Bryan Bishop | 59a71a7cd3 | |
Bryan Bishop | c3e590f22f | |
Cathal Garvey | f3e4b74b69 | |
Cathal Garvey | 6030778089 | |
Bryan Bishop | 1070605316 |
6
Makefile
6
Makefile
|
@ -1,7 +1,7 @@
|
|||
SHELL := /bin/bash
|
||||
|
||||
test:
|
||||
nosetests-2.7 -s --verbosity=2
|
||||
nosetests -s --verbosity=2
|
||||
|
||||
clean:
|
||||
rm -fr build dist
|
||||
|
@ -10,8 +10,8 @@ clean:
|
|||
find . -name *.swp -exec rm {} \;
|
||||
|
||||
install:
|
||||
python2.7 setup.py install
|
||||
python3 setup.py install
|
||||
|
||||
upload: clean
|
||||
python2.7 setup.py sdist upload
|
||||
python3 setup.py sdist upload
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ publishers include private information like institution names, personal names,
|
|||
ip addresses, timestamps and other identifying information in watermarks on
|
||||
each page.
|
||||
|
||||
pdfparania это библиотека для удаления водяных знаков из PDF файлов научных
|
||||
pdfparanoia это библиотека для удаления водяных знаков из PDF файлов научных
|
||||
статей. Некоторые издатели включают личную информацию, такую как названия
|
||||
институтов, имена, IP-адреса, время и дату и другую информацию в водяные знаки
|
||||
содержащиеся на каждой странице.
|
||||
|
@ -26,6 +26,7 @@ sudo python setup.py install
|
|||
|
||||
pdfparanoia is written for python2.7+ or python 3.
|
||||
You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia.
|
||||
For python versions prior to Python 3, use "pdfminer" from the Python Package Index (http://pypi.python.org). For recent versions of Python, use pdfminer3k instead.
|
||||
|
||||
## Usage
|
||||
|
||||
|
|
|
@ -12,7 +12,11 @@ if __name__ == "__main__":
|
|||
import sys
|
||||
import pdfparanoia
|
||||
import argparse
|
||||
from StringIO import StringIO
|
||||
|
||||
try:
|
||||
from StringIO import StringIO
|
||||
except ImportError:
|
||||
from io import StringIO, BytesIO
|
||||
|
||||
ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.")
|
||||
ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'),
|
||||
|
|
|
@ -1,2 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
pdfparanoia - pdf watermark remover library for academic papers
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
||||
usage:
|
||||
|
||||
>>> import pdfparanoia
|
||||
>>> pdf = pdfparanoia.scrub(open("nmat.pdf", "rb"))
|
||||
>>> file_handler = open("output.pdf", "wb")
|
||||
>>> file_handler.write(pdf)
|
||||
>>> file_handler.close()
|
||||
|
||||
:copyright: (c) 2013 by Bryan Bishop.
|
||||
:license: BSD.
|
||||
"""
|
||||
|
||||
__title__ = "pdfparanoia"
|
||||
__version__ = "0.0.17"
|
||||
__build__ = 0x000017
|
||||
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||
__license__ = "BSD"
|
||||
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||
|
||||
from . import utils
|
||||
from .core import scrub
|
||||
from .parser import deflate
|
||||
|
||||
|
|
|
@ -7,15 +7,16 @@ Deals with the existential nature of parsing pdfs.
|
|||
|
||||
"""
|
||||
|
||||
from StringIO import StringIO
|
||||
try:
|
||||
from StringIO import StringIO
|
||||
except ImportError: # py3k
|
||||
from io import StringIO, BytesIO
|
||||
|
||||
# Maybe one day pdfquery will be able to save pdf.
|
||||
# from pdfquery import PDFQuery
|
||||
|
||||
from pdfminer.pdfparser import (
|
||||
PDFParser,
|
||||
PDFDocument,
|
||||
)
|
||||
import pdfminer.pdfparser
|
||||
import pdfminer.pdfdocument
|
||||
|
||||
from .eraser import replace_object_with
|
||||
|
||||
|
@ -27,10 +28,8 @@ def parse_pdf(handler):
|
|||
handler.seek(0)
|
||||
|
||||
# setup for parsing
|
||||
parser = PDFParser(handler)
|
||||
doc = PDFDocument()
|
||||
parser.set_document(doc)
|
||||
doc.set_parser(parser)
|
||||
parser = pdfminer.pdfparser.PDFParser(handler)
|
||||
doc = pdfminer.pdfdocument.PDFDocument(parser)
|
||||
|
||||
# actual parsing
|
||||
doc.initialize()
|
||||
|
@ -58,8 +57,7 @@ def deflate(content):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# store new replacements
|
||||
|
|
|
@ -11,4 +11,4 @@ from .aip import *
|
|||
from .ieee import *
|
||||
from .jstor import *
|
||||
from .rsc import *
|
||||
|
||||
from .sciencemagazine import *
|
||||
|
|
|
@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
|
@ -36,7 +35,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||
|
||||
if hasattr(obj, "attrs"):
|
||||
# watermarks tend to be in FlateDecode elements
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
length = obj.attrs["Length"]
|
||||
|
||||
# the watermark is never very long
|
||||
|
@ -45,7 +44,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
|||
data = copy(obj.get_data())
|
||||
|
||||
phrase="Redistribution subject to AIP license or copyright"
|
||||
if phrase in data:
|
||||
if phrase in str(data):
|
||||
if verbose >= 2:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
|
||||
elif verbose >= 1:
|
||||
|
|
|
@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
|
@ -33,13 +32,13 @@ class IEEEXplore(Plugin):
|
|||
|
||||
if hasattr(obj, "attrs"):
|
||||
# watermarks tend to be in FlateDecode elements
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
#length = obj.attrs["Length"]
|
||||
#rawdata = copy(obj.rawdata)
|
||||
data = copy(obj.get_data())
|
||||
|
||||
phrase= "Authorized licensed use limited to: "
|
||||
if phrase in data:
|
||||
if phrase in str(data):
|
||||
if verbose >= 2:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
||||
elif verbose >= 1:
|
||||
|
|
|
@ -10,6 +10,8 @@ from ..eraser import (
|
|||
)
|
||||
from ..plugin import Plugin
|
||||
|
||||
from pdfminer.pdftypes import PDFObjectNotFound
|
||||
|
||||
class JSTOR(Plugin):
|
||||
"""
|
||||
JSTOR
|
||||
|
@ -44,55 +46,57 @@ class JSTOR(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
for objid in objids:
|
||||
# get an object by id
|
||||
obj = pdf.getobj(objid)
|
||||
try:
|
||||
obj = pdf.getobj(objid)
|
||||
|
||||
if hasattr(obj, "attrs"):
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
data = copy(obj.get_data())
|
||||
if hasattr(obj, "attrs"):
|
||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||
data = copy(obj.get_data())
|
||||
|
||||
# make sure all of the requirements are in there
|
||||
if all([requirement in data for requirement in JSTOR.requirements]):
|
||||
better_content = data
|
||||
|
||||
# remove the date
|
||||
startpos = better_content.find("This content downloaded ")
|
||||
endpos = better_content.find(")", startpos)
|
||||
segment = better_content[startpos:endpos]
|
||||
if verbose >= 2 and replacements:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
||||
|
||||
better_content = better_content.replace(segment, "")
|
||||
|
||||
# it looks like all of the watermarks are at the end?
|
||||
better_content = better_content[:-160]
|
||||
|
||||
# "Accessed on dd/mm/yyyy hh:mm"
|
||||
#
|
||||
# the "Accessed" line is only on the first page
|
||||
#
|
||||
# it's based on /F2
|
||||
#
|
||||
# This would be better if it could be decoded to
|
||||
# actually search for the "Accessed" text.
|
||||
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||
endpos = better_content.find("Tf\n", startpos+5)
|
||||
# make sure all of the requirements are in there
|
||||
if all([requirement in data for requirement in JSTOR.requirements]):
|
||||
better_content = data
|
||||
|
||||
# remove the date
|
||||
startpos = better_content.find("This content downloaded ")
|
||||
endpos = better_content.find(")", startpos)
|
||||
segment = better_content[startpos:endpos]
|
||||
if verbose >= 2 and replacements:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
||||
|
||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||
better_content = better_content.replace(segment, "")
|
||||
|
||||
replacements.append([objid, better_content])
|
||||
# it looks like all of the watermarks are at the end?
|
||||
better_content = better_content[:-160]
|
||||
|
||||
page_id += 1
|
||||
# "Accessed on dd/mm/yyyy hh:mm"
|
||||
#
|
||||
# the "Accessed" line is only on the first page
|
||||
#
|
||||
# it's based on /F2
|
||||
#
|
||||
# This would be better if it could be decoded to
|
||||
# actually search for the "Accessed" text.
|
||||
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||
endpos = better_content.find("Tf\n", startpos+5)
|
||||
|
||||
if verbose >= 2 and replacements:
|
||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
||||
|
||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||
|
||||
replacements.append([objid, better_content])
|
||||
|
||||
page_id += 1
|
||||
except PDFObjectNotFound as e:
|
||||
print >>sys.stderr, 'Missing object: %r' % e
|
||||
|
||||
if verbose >= 1 and replacements:
|
||||
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
||||
|
|
|
@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
|
|||
pdf = parse_content(content)
|
||||
|
||||
# get a list of all object ids
|
||||
xrefs = pdf._parser.read_xref()
|
||||
xref = xrefs[0]
|
||||
xref = pdf.xrefs[0]
|
||||
objids = xref.get_objids()
|
||||
|
||||
# check each object in the pdf
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
import sys
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class ScienceMagazine(Plugin):
    """
    Science Magazine
    ~~~~~~~~~~~~~~~

    Strips the advertisement object out of Science Magazine papers.
    """

    # TODO: confirm more reliably that the paper really comes from
    # sciencemag. One option is to look for "oascentral" in one of the URIs,
    # since the ads are all hyperlinked to that server.

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Remove the ad object from ``content`` and return the result.

        The content is parsed with ``parse_content`` and every object id
        listed in the document's first xref entry is inspected. An object is
        treated as an ad when it has an ``attrs`` mapping whose "Width" is
        "432" and whose "Height" is "230" (both compared as strings;
        presumably the fixed dimensions of the ad image -- verify against
        sample papers).

        :param content: raw pdf content, passed to ``parse_content`` and
            ``remove_object_by_id``.
        :param verbose: accepted for interface compatibility; not used by
            this method.
        :returns: ``content`` with each matching object removed through
            ``remove_object_by_id``.
        :raises Exception: when more than one matching object is found.
        """

        def looks_like_ad(candidate):
            # objects without an attribute dictionary can never match
            if not hasattr(candidate, "attrs"):
                return False

            attributes = candidate.attrs
            if "Width" not in attributes or str(attributes["Width"]) != "432":
                return False

            return "Height" in attributes and str(attributes["Height"]) == "230"

        # parse the pdf into a pdfminer document and list its object ids
        document = parse_content(content)
        object_ids = document.xrefs[0].get_objids()

        # fetch every object in turn and keep the ids of the ad-sized ones
        ad_ids = [
            object_id
            for object_id in object_ids
            if looks_like_ad(document.getobj(object_id))
        ]

        # more than one hit is suspicious, so bail out instead of guessing
        if len(ad_ids) > 1:
            raise Exception("too many ads detected on the page, please double check?")

        for object_id in ad_ids:
            content = remove_object_by_id(content, object_id)

        return content
|
|
@ -1 +1 @@
|
|||
pdfminer>=0
|
||||
pdfminer>=20131113
|
||||
|
|
20
setup.py
20
setup.py
|
@ -1,12 +1,26 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from setuptools import setup
|
||||
import os
|
||||
import platform
|
||||
|
||||
import pdfparanoia
|
||||
|
||||
long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).read()
|
||||
|
||||
# pdfminer isn't cross-version compatible but a py3k port is in PyPI
|
||||
if platform.python_version() >= "3.0.0":
|
||||
dependencies = ["pdfminer3k>=1.3.0"]
|
||||
else:
|
||||
dependencies = ["pdfminer>=20131113"]
|
||||
|
||||
packages = [
|
||||
"pdfparanoia",
|
||||
"pdfparanoia.plugins",
|
||||
]
|
||||
|
||||
setup(
|
||||
name="pdfparanoia",
|
||||
version="0.0.14",
|
||||
version=pdfparanoia.__version__,
|
||||
url="https://github.com/kanzure/pdfparanoia",
|
||||
license="BSD",
|
||||
author="Bryan Bishop",
|
||||
|
@ -15,7 +29,9 @@ setup(
|
|||
maintainer_email="kanzure@gmail.com",
|
||||
description="pdf watermark remover library for academic papers",
|
||||
long_description=long_description,
|
||||
install_requires=["pdfminer>=0"],
|
||||
long_description_content_type="text/markdown",
|
||||
install_requires=dependencies,
|
||||
packages=packages,
|
||||
scripts=["bin/pdfparanoia"],
|
||||
platforms="any",
|
||||
zip_safe=False,
|
||||
|
|
|
@ -8,13 +8,13 @@ class JSTORTestCase(unittest.TestCase):
|
|||
file_handler = open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb")
|
||||
content = file_handler.read()
|
||||
file_handler.close()
|
||||
self.assertIn("\n18 0 obj \n", content)
|
||||
self.assertIn(b"\n18 0 obj \n", content)
|
||||
|
||||
# this section will later be manipulated
|
||||
self.assertIn("\n19 0 obj \n", content)
|
||||
self.assertIn(b"\n19 0 obj \n", content)
|
||||
|
||||
output = pdfparanoia.plugins.JSTOR.scrub(content)
|
||||
|
||||
# FlateDecode should be replaced with a decompressed section
|
||||
self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output)
|
||||
self.assertIn(b"\n19 0 obj\n<</Length 2862>>stream", output)
|
||||
|
||||
|
|
Loading…
Reference in New Issue