diff --git a/README.md b/README.md index f30855b..96819d7 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,9 @@ or, sudo python setup.py install ``` +pdfparanoia is written for python2.7+ or python 3. +You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia. + ## Usage ``` python @@ -23,15 +26,14 @@ import pdfparanoia pdf = pdfparanoia.scrub(open("nmat91417.pdf", "rb")) -file_handler = open("output.pdf", "wb") -file_handler.write(pdf) -file_handler.close() +with open("output.pdf", "wb") as file_handler: + file_handler.write(pdf) ``` or from the shell, ``` bash -pdfparanoia --verbose input.pdf > output.pdf +pdfparanoia --verbose input.pdf -o output.pdf ``` and, diff --git a/bin/pdfparanoia b/bin/pdfparanoia index 749fa96..389ee03 100755 --- a/bin/pdfparanoia +++ b/bin/pdfparanoia @@ -10,28 +10,29 @@ stdin/piping or by referencing a file in argv[0]. if __name__ == "__main__": import sys - import fileinput + import pdfparanoia + import argparse from StringIO import StringIO + ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.") + ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'), + default='-') # argparse.FileType interprets "-" as Stdin. + ArgP.add_argument("-o", "--output", type=argparse.FileType('wb'), + default=sys.stdout) + ArgP.add_argument("-v", "--verbose", action="store_true", default=False, + help="Output more information, which may be sensitive or excessive.") + ArgP.add_argument("-V", "--more-verbose", action="store_true", default=False, + help="Output even more information. Implies -v.") + Args = ArgP.parse_args() + verbose = 0 - while '--verbose' in sys.argv: - verbose += 1 - sys.argv.pop(sys.argv.index('--verbose')) - - while '-v' in sys.argv: - verbose += 1 - sys.argv.pop(sys.argv.index('-v')) - - import pdfparanoia - - # read in all lines - content = "" - for line in fileinput.input(): - content += line - - # scrub the pdf to get rid of watermarks - output = pdfparanoia.scrub(StringIO(content), verbose=verbose) - - # dump to output - sys.stdout.write(output) + if Args.verbose: verbose = 1 + if Args.more_verbose: verbose = 2 + # I really don't like having to read a file only to cast as StringIO, but seems + # necessary to get reading from StdIn to play nicely with pdfparanoia. + outputcontent = pdfparanoia.scrub(StringIO(Args.in_pdf.read()), verbose=verbose) + Args.in_pdf.close() + Args.output.write(outputcontent) + if Args.output != sys.stdout: + Args.output.close() diff --git a/setup.py b/setup.py index 564b910..6f07b18 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ setup( "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Programming Language :: Python", - #"Programming Language :: Python :: 2.6", + # Uses argparse and with statement; 2.7+ "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.1",