1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-05-29 07:08:03 +02:00

Compare commits

...

26 Commits

Author SHA1 Message Date
Bryan Bishop
100d202221
further updates to Makefile 2020-09-07 09:33:54 -05:00
Bryan Bishop
c1af289767
bump version to 0.0.17 2020-09-07 09:33:08 -05:00
Bryan Bishop
26dfe093fe
start fixing some tests 2020-09-07 09:28:59 -05:00
Bryan Bishop
681af5c175
upgrade nosetests from python2.7 2020-09-07 09:25:12 -05:00
Bryan Bishop
906150e033
render markdown on pypi
fixes #54
2020-09-07 09:21:19 -05:00
Bryan Bishop
585a0ac3a4 Merge pull request #51 from ashwini0529/master
fix python3 installation error
2017-06-04 20:55:45 -05:00
Ashwini Purohit
5669e4e289 fixes python3 installation error
Fixes invalid syntax error
2017-06-05 06:30:56 +05:30
Bryan Bishop
c594ff41d6 version bump to: v0.0.16 2016-05-29 14:09:00 -05:00
Bryan Bishop
1a01757f44 README: fix typo in russian text
http://gnusha.org/logs/2016-02-25.log
2016-02-25 08:45:27 -06:00
Bryan Bishop
5cc682e2c5 Merge pull request #38 from fmap/pdfminer-api
PDFMiner made breaking interface changes
2013-12-06 15:27:42 -08:00
vi
e95374ec04 getobj can raise PDFObjectNotFound 2013-12-07 07:23:55 +08:00
vi
95a249d8ab Package: use a version of PDFMiner since the interface change (#37). 2013-12-07 07:23:47 +08:00
vi
380bc289b3 Adapt to PDFMiner's breaking interface changes (#37). 2013-12-07 07:23:34 +08:00
Bryan Bishop
713776af67 version bump to: v0.0.15 2013-09-16 15:14:27 -05:00
Bryan Bishop
e9e0ea4467 Merge pull request #34 from kanzure/fixsciencemag
Fix another syntax error in sciencemag
2013-09-16 13:12:37 -07:00
Bryan Bishop
61e67d2c4a Merge pull request #33 from kanzure/fixsetup
Fix setup.py to not have a syntax error
2013-09-16 13:12:29 -07:00
Bryan Bishop
28bf8f5825 fix another syntax error in sciencemag
How were these missed??
2013-09-16 15:11:42 -05:00
Bryan Bishop
1ff513389f wow, how did setup.py stay like that for so long? 2013-09-16 15:09:59 -05:00
Bryan Bishop
cc7d14d173 WIP of "AdBlock for Science"
The purpose of adblock for science is to remove nasty ads from papers,
which at the moment means only papers from Science Magazine as published
by the American Association for the Advancement of Science (AAAS).

I am really annoyed that I have to write an ad blocker... for science
papers.
2013-07-19 21:31:30 -05:00
Bryan Bishop
71aaf23285 io.StringIO fallback for py3k 2013-07-19 21:27:06 -05:00
Bryan Bishop
528eae7e46 minor py3k-compat changes 2013-07-19 21:26:12 -05:00
Bryan Bishop
59a71a7cd3 use io.StringIO when py3k 2013-07-19 21:25:42 -05:00
Bryan Bishop
c3e590f22f Revert "fixed self-referential package install and cleaned out __init__.py"
This reverts commit 2275565fb2.

__init__.py is needed for ./bin/pdfparanoia to work.

Conflicts:
	setup.py
2013-07-19 20:19:44 -05:00
Cathal Garvey
f3e4b74b69 Amended readme for those not using pip to install. 2013-07-14 11:10:15 +01:00
Cathal Garvey
6030778089 Made dependencies vary by version to select py3k port of pdfminer if using Py3k 2013-07-14 11:01:27 +01:00
Bryan Bishop
1070605316 use explicit imports
Mock tests will be much easier if explicit imports are used everywhere
instead of the previous format.
2013-07-09 01:47:18 -05:00
14 changed files with 166 additions and 70 deletions

View File

@ -1,7 +1,7 @@
SHELL := /bin/bash SHELL := /bin/bash
test: test:
nosetests-2.7 -s --verbosity=2 nosetests -s --verbosity=2
clean: clean:
rm -fr build dist rm -fr build dist
@ -10,8 +10,8 @@ clean:
find . -name *.swp -exec rm {} \; find . -name *.swp -exec rm {} \;
install: install:
python2.7 setup.py install python3 setup.py install
upload: clean upload: clean
python2.7 setup.py sdist upload python3 setup.py sdist upload

View File

@ -5,7 +5,7 @@ publishers include private information like institution names, personal names,
ip addresses, timestamps and other identifying information in watermarks on ip addresses, timestamps and other identifying information in watermarks on
each page. each page.
pdfparania это библиотека для удаления водяных знаков из PDF файлов научных pdfparanoia это библиотека для удаления водяных знаков из PDF файлов научных
статей. Некоторые издатели включают личную информацию, такую как названия статей. Некоторые издатели включают личную информацию, такую как названия
институтов, имена, IP-адреса, время и дату и другую информацию в водяные знаки институтов, имена, IP-адреса, время и дату и другую информацию в водяные знаки
содержащиеся на каждой странице. содержащиеся на каждой странице.
@ -26,6 +26,7 @@ sudo python setup.py install
pdfparanoia is written for python2.7+ or python 3. pdfparanoia is written for python2.7+ or python 3.
You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia. You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia.
For python versions prior to Python 3, use "pdfminer" from the Python Package Index (http://pypi.python.org). For recent versions of Python, use pdfminer3k instead.
## Usage ## Usage

View File

@ -12,7 +12,11 @@ if __name__ == "__main__":
import sys import sys
import pdfparanoia import pdfparanoia
import argparse import argparse
from StringIO import StringIO
try:
from StringIO import StringIO
except ImportError:
from io import StringIO, BytesIO
ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.") ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.")
ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'), ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'),

View File

@ -1,2 +1,29 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""
pdfparanoia - pdf watermark remover library for academic papers
~~~~~~~~~~~~~~~
pdfparanoia is a pdf watermark remover library for academic papers. Basic
usage (PDF files are binary, so open them in binary mode; this assumes
scrub() hands back bytes that can be written as-is -- TODO confirm):
>>> import pdfparanoia
>>> pdf = pdfparanoia.scrub(open("nmat.pdf", "rb"))
>>> file_handler = open("output.pdf", "wb")
>>> file_handler.write(pdf)
>>> file_handler.close()
:copyright: (c) 2013 by Bryan Bishop.
:license: BSD.
"""
# Package metadata. setup.py reads __version__ from this module, so bump the
# version here rather than in setup.py.
__title__ = "pdfparanoia"
__version__ = "0.0.17"
# NOTE(review): the hex digits appear to mirror the version string (0.0.17);
# keep the two in sync when bumping -- confirm the intended encoding.
__build__ = 0x000017
__author__ = "Bryan Bishop <kanzure@gmail.com>"
__license__ = "BSD"
__copyright__ = "Copyright 2013 Bryan Bishop"
from . import utils
from .core import scrub
from .parser import deflate

View File

@ -7,15 +7,16 @@ Deals with the existential nature of parsing pdfs.
""" """
from StringIO import StringIO try:
from StringIO import StringIO
except ImportError: # py3k
from io import StringIO, BytesIO
# Maybe one day pdfquery will be able to save pdf. # Maybe one day pdfquery will be able to save pdf.
# from pdfquery import PDFQuery # from pdfquery import PDFQuery
from pdfminer.pdfparser import ( import pdfminer.pdfparser
PDFParser, import pdfminer.pdfdocument
PDFDocument,
)
from .eraser import replace_object_with from .eraser import replace_object_with
@ -27,10 +28,8 @@ def parse_pdf(handler):
handler.seek(0) handler.seek(0)
# setup for parsing # setup for parsing
parser = PDFParser(handler) parser = pdfminer.pdfparser.PDFParser(handler)
doc = PDFDocument() doc = pdfminer.pdfdocument.PDFDocument(parser)
parser.set_document(doc)
doc.set_parser(parser)
# actual parsing # actual parsing
doc.initialize() doc.initialize()
@ -58,8 +57,7 @@ def deflate(content):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# store new replacements # store new replacements

View File

@ -11,4 +11,4 @@ from .aip import *
from .ieee import * from .ieee import *
from .jstor import * from .jstor import *
from .rsc import * from .rsc import *
from .sciencemagazine import *

View File

@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf
@ -36,7 +35,7 @@ class AmericanInstituteOfPhysics(Plugin):
if hasattr(obj, "attrs"): if hasattr(obj, "attrs"):
# watermarks tend to be in FlateDecode elements # watermarks tend to be in FlateDecode elements
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
length = obj.attrs["Length"] length = obj.attrs["Length"]
# the watermark is never very long # the watermark is never very long
@ -45,7 +44,7 @@ class AmericanInstituteOfPhysics(Plugin):
data = copy(obj.get_data()) data = copy(obj.get_data())
phrase="Redistribution subject to AIP license or copyright" phrase="Redistribution subject to AIP license or copyright"
if phrase in data: if phrase in str(data):
if verbose >= 2: if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data)) sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
elif verbose >= 1: elif verbose >= 1:

View File

@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf
@ -33,13 +32,13 @@ class IEEEXplore(Plugin):
if hasattr(obj, "attrs"): if hasattr(obj, "attrs"):
# watermarks tend to be in FlateDecode elements # watermarks tend to be in FlateDecode elements
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
#length = obj.attrs["Length"] #length = obj.attrs["Length"]
#rawdata = copy(obj.rawdata) #rawdata = copy(obj.rawdata)
data = copy(obj.get_data()) data = copy(obj.get_data())
phrase= "Authorized licensed use limited to: " phrase= "Authorized licensed use limited to: "
if phrase in data: if phrase in str(data):
if verbose >= 2: if verbose >= 2:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000])) sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
elif verbose >= 1: elif verbose >= 1:

View File

@ -10,6 +10,8 @@ from ..eraser import (
) )
from ..plugin import Plugin from ..plugin import Plugin
from pdfminer.pdftypes import PDFObjectNotFound
class JSTOR(Plugin): class JSTOR(Plugin):
""" """
JSTOR JSTOR
@ -44,55 +46,57 @@ class JSTOR(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf
for objid in objids: for objid in objids:
# get an object by id # get an object by id
obj = pdf.getobj(objid) try:
obj = pdf.getobj(objid)
if hasattr(obj, "attrs"): if hasattr(obj, "attrs"):
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode": if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
data = copy(obj.get_data()) data = copy(obj.get_data())
# make sure all of the requirements are in there # make sure all of the requirements are in there
if all([requirement in data for requirement in JSTOR.requirements]): if all([requirement in data for requirement in JSTOR.requirements]):
better_content = data better_content = data
# remove the date
startpos = better_content.find("This content downloaded ")
endpos = better_content.find(")", startpos)
segment = better_content[startpos:endpos]
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
better_content = better_content.replace(segment, "")
# it looks like all of the watermarks are at the end?
better_content = better_content[:-160]
# "Accessed on dd/mm/yyy hh:mm"
#
# the "Accessed" line is only on the first page
#
# it's based on /F2
#
# This would be better if it could be decoded to
# actually search for the "Accessed" text.
if page_id == 0 and "/F2 11 Tf\n" in better_content:
startpos = better_content.rfind("/F2 11 Tf\n")
endpos = better_content.find("Tf\n", startpos+5)
# remove the date
startpos = better_content.find("This content downloaded ")
endpos = better_content.find(")", startpos)
segment = better_content[startpos:endpos]
if verbose >= 2 and replacements: if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos])) sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
better_content = better_content[0:startpos] + better_content[endpos:] better_content = better_content.replace(segment, "")
replacements.append([objid, better_content]) # it looks like all of the watermarks are at the end?
better_content = better_content[:-160]
page_id += 1 # "Accessed on dd/mm/yyy hh:mm"
#
# the "Accessed" line is only on the first page
#
# it's based on /F2
#
# This would be better if it could be decoded to
# actually search for the "Accessed" text.
if page_id == 0 and "/F2 11 Tf\n" in better_content:
startpos = better_content.rfind("/F2 11 Tf\n")
endpos = better_content.find("Tf\n", startpos+5)
if verbose >= 2 and replacements:
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
better_content = better_content[0:startpos] + better_content[endpos:]
replacements.append([objid, better_content])
page_id += 1
except PDFObjectNotFound as e:
print >>sys.stderr, 'Missing object: %r' % e
if verbose >= 1 and replacements: if verbose >= 1 and replacements:
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements)) sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))

View File

@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
pdf = parse_content(content) pdf = parse_content(content)
# get a list of all object ids # get a list of all object ids
xrefs = pdf._parser.read_xref() xref = pdf.xrefs[0]
xref = xrefs[0]
objids = xref.get_objids() objids = xref.get_objids()
# check each object in the pdf # check each object in the pdf

View File

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
from copy import copy
import sys
from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin
class ScienceMagazine(Plugin):
    """
    Science Magazine
    ~~~~~~~~~~~~~~~

    Strips advertisements out of academic papers. :(
    """

    # TODO: better confirmation that the paper is from sciencemag. Look for
    # "oascentral" in one of the URIs, since the ads are all hyperlinked to
    # that server.

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Remove the advertisement object from a pdf.

        Every object listed in the document's first xref table is inspected;
        an object counts as an ad when its "Width" attribute stringifies to
        "432" and its "Height" attribute stringifies to "230".

        :param content: the pdf content, passed to parse_content() and to
            remove_object_by_id()
        :param verbose: accepted for signature parity with other plugins;
            not used by this method
        :returns: content with the matching object (if any) removed
        :raises Exception: when more than one matching object is found
        """
        document = parse_content(content)

        # object ids come from the first cross-reference table
        ad_object_ids = []
        for object_id in document.xrefs[0].get_objids():
            candidate = document.getobj(object_id)

            # only objects carrying an attribute dict can be image-like ads
            if not hasattr(candidate, "attrs"):
                continue

            if "Width" not in candidate.attrs or str(candidate.attrs["Width"]) != "432":
                continue

            if "Height" not in candidate.attrs or str(candidate.attrs["Height"]) != "230":
                continue

            ad_object_ids.append(object_id)

        # more than one hit means the size heuristic is probably matching
        # something that is not an ad, so refuse to touch the document
        if len(ad_object_ids) > 1:
            raise Exception("too many ads detected on the page, please double check?")

        for object_id in ad_object_ids:
            content = remove_object_by_id(content, object_id)

        return content

View File

@ -1 +1 @@
pdfminer>=0 pdfminer>=20131113

View File

@ -1,12 +1,26 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from setuptools import setup from setuptools import setup
import os import os
import platform
import pdfparanoia
long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).read() long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).read()
# pdfminer isn't cross-version compatible but a py3k port is in PyPI.
#
# Compare the major version as an integer. Comparing version *strings* is
# lexicographic, so a check such as "10.0.0" >= "3.0.0" evaluates to False
# and would select the Python 2 package on a future two-digit major version.
if int(platform.python_version_tuple()[0]) >= 3:
    dependencies = ["pdfminer3k>=1.3.0"]
else:
    dependencies = ["pdfminer>=20131113"]

# packages shipped in the distribution
packages = [
    "pdfparanoia",
    "pdfparanoia.plugins",
]
setup( setup(
name="pdfparanoia", name="pdfparanoia",
version="0.0.14", version=pdfparanoia.__version__,
url="https://github.com/kanzure/pdfparanoia", url="https://github.com/kanzure/pdfparanoia",
license="BSD", license="BSD",
author="Bryan Bishop", author="Bryan Bishop",
@ -15,7 +29,9 @@ setup(
maintainer_email="kanzure@gmail.com", maintainer_email="kanzure@gmail.com",
description="pdf watermark remover library for academic papers", description="pdf watermark remover library for academic papers",
long_description=long_description, long_description=long_description,
install_requires=["pdfminer>=0"], long_description_content_type="text/markdown",
install_requires=dependencies,
packages=packages,
scripts=["bin/pdfparanoia"], scripts=["bin/pdfparanoia"],
platforms="any", platforms="any",
zip_safe=False, zip_safe=False,

View File

@ -8,13 +8,13 @@ class JSTORTestCase(unittest.TestCase):
file_handler = open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb") file_handler = open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb")
content = file_handler.read() content = file_handler.read()
file_handler.close() file_handler.close()
self.assertIn("\n18 0 obj \n", content) self.assertIn(b"\n18 0 obj \n", content)
# this section will later be manipulated # this section will later be manipulated
self.assertIn("\n19 0 obj \n", content) self.assertIn(b"\n19 0 obj \n", content)
output = pdfparanoia.plugins.JSTOR.scrub(content) output = pdfparanoia.plugins.JSTOR.scrub(content)
# FlateDecode should be replaced with a decompressed section # FlateDecode should be replaced with a decompressed section
self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output) self.assertIn(b"\n19 0 obj\n<</Length 2862>>stream", output)