1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-06-10 12:49:52 +02:00
This commit is contained in:
Cathal Garvey 2013-03-21 16:57:33 -07:00
commit c1c195e965
3 changed files with 29 additions and 26 deletions

View File

@ -16,6 +16,9 @@ or,
sudo python setup.py install sudo python setup.py install
``` ```
pdfparanoia is written for Python 2.7+ or Python 3.
If you do not use pip to install pdfparanoia, you will also need to install "pdfminer" manually.
## Usage ## Usage
``` python ``` python
@ -23,15 +26,14 @@ import pdfparanoia
pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb")) pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb"))
file_handler = open("output.pdf", "wb") with open("output.pdf", "wb") as file_handler:
file_handler.write(pdf) file_handler.write(pdf)
file_handler.close()
``` ```
or from the shell, or from the shell,
``` bash ``` bash
pdfparanoia --verbose input.pdf > output.pdf pdfparanoia --verbose input.pdf -o output.pdf
``` ```
and, and,

View File

@ -10,28 +10,29 @@ stdin/piping or by referencing a file in argv[0].
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
import fileinput import pdfparanoia
import argparse
from StringIO import StringIO from StringIO import StringIO
ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.")
ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'),
default='-') # argparse.FileType interprets "-" as Stdin.
ArgP.add_argument("-o", "--output", type=argparse.FileType('wb'),
default=sys.stdout)
ArgP.add_argument("-v", "--verbose", action="store_true", default=False,
help="Output more information, which may be sensitive or excessive.")
ArgP.add_argument("-V", "--more-verbose", action="store_true", default=False,
help="Output even more information. Implies -v.")
Args = ArgP.parse_args()
verbose = 0 verbose = 0
while '--verbose' in sys.argv: if Args.verbose: verbose = 1
verbose += 1 if Args.more_verbose: verbose = 2
sys.argv.pop(sys.argv.index('--verbose'))
while '-v' in sys.argv:
verbose += 1
sys.argv.pop(sys.argv.index('-v'))
import pdfparanoia
# read in all lines
content = ""
for line in fileinput.input():
content += line
# scrub the pdf to get rid of watermarks
output = pdfparanoia.scrub(StringIO(content), verbose=verbose)
# dump to output
sys.stdout.write(output)
# Reading the whole input only to wrap it in a StringIO is not ideal, but it
# seems necessary for input read from stdin to play nicely with pdfparanoia.
outputcontent = pdfparanoia.scrub(StringIO(Args.in_pdf.read()), verbose=verbose)
Args.in_pdf.close()
Args.output.write(outputcontent)
if Args.output != sys.stdout:
Args.output.close()

View File

@ -31,7 +31,7 @@ setup(
"License :: OSI Approved :: BSD License", "License :: OSI Approved :: BSD License",
"Operating System :: OS Independent", "Operating System :: OS Independent",
"Programming Language :: Python", "Programming Language :: Python",
#"Programming Language :: Python :: 2.6", # Uses argparse and with statement; 2.7+
"Programming Language :: Python :: 2.7", "Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.1", "Programming Language :: Python :: 3.1",