1
0
mirror of synced 2025-01-24 04:17:02 +01:00

Add lightweight processing for PDF

This commit is contained in:
jvoisin 2018-04-14 21:23:31 +02:00
parent 6f4ed2490f
commit 96299c6a53
5 changed files with 84 additions and 15 deletions

15
main.py
View File

@ -31,6 +31,8 @@ def create_arg_parser():
help='list all supported fileformats')
info.add_argument('-s', '--show', action='store_true',
help='list all the harmful metadata of a file without removing them')
info.add_argument('-L', '--lightweight', action='store_true',
help='remove SOME metadata')
return parser
@ -50,7 +52,7 @@ def show_meta(filename:str):
print(" %s: harmful content" % k)
def clean_meta(filename:str):
def clean_meta(filename:str, is_lightweigth:bool):
if not __check_file(filename, os.R_OK|os.W_OK):
return
@ -58,7 +60,10 @@ def clean_meta(filename:str):
if p is None:
print("[-] %s's format (%s) is not supported" % (filename, mtype))
return
p.remove_all()
if is_lightweigth:
p.remove_all_lightweight()
else:
p.remove_all()
def show_parsers():
@ -78,12 +83,12 @@ def __get_files_recursively(files):
for _f in _files:
yield os.path.join(path, _f)
def __do_clean_async(is_lightweigth, q):
    """Worker loop: clean every file pulled from the queue.

    :param is_lightweigth: forwarded to `clean_meta`; when true the file is
        cleaned through the lightweight code path instead of the full one.
    :param q: queue of filenames to process; a `None` item tells the
        worker that there is nothing left and makes it return.
    """
    while True:
        f = q.get()
        if f is None:  # nothing more to process
            return
        # `clean_meta` is declared as (filename, is_lightweigth): the
        # filename has to come first, otherwise the flag is used as the path.
        clean_meta(f, is_lightweigth)
        q.task_done()
@ -109,7 +114,7 @@ def main():
q.put(f)
for _ in range(multiprocessing.cpu_count()):
worker = Thread(target=__do_clean_async, args=(q, ))
worker = Thread(target=__do_clean_async, args=(mode, q))
worker.start()
threads.append(worker)

View File

@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod
def remove_all(self) -> bool:
pass
def remove_all_lightweight(self) -> bool:
    """
    Remove _some_ of the metadata.

    The default implementation just delegates to `remove_all` and
    hands its result back unchanged.
    """
    outcome = self.remove_all()
    return outcome

View File

@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render
def remove_all_lightweight(self) -> bool:
    """
    Load the document into Poppler, render pages on a new PDFSurface.

    Every page is drawn with `render_for_printing` onto a cairo PDF
    surface backed by a temporary file. The producer and creator fields
    of that file are then blanked and the result is saved to
    `self.output_filename`.

    :return: True once the cleaned file has been written.
    """
    document = Poppler.Document.new_from_file(self.uri, None)
    pages_count = document.get_n_pages()

    # mkstemp() returns an already-open descriptor along with the path.
    # cairo writes through the path, so close the descriptor right away
    # instead of leaking it.
    tmp_fd, tmp_path = tempfile.mkstemp()
    os.close(tmp_fd)

    try:
        # The initial size is a placeholder: it is reset for every page.
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            pdf_surface.set_size(page_width, page_height)
            pdf_context.save()
            page.render_for_printing(pdf_context)
            pdf_context.restore()
            pdf_context.show_page()  # draw pdf_context on pdf_surface

        pdf_surface.finish()
        self.__remove_superficial_meta(tmp_path, self.output_filename)
    finally:
        # Drop the temporary file even when rendering or saving fails.
        os.remove(tmp_path)

    return True
def remove_all(self):
"""
Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF. Metadata from the new
PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering.
and shove those PNG into a new PDF.
"""
document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
pdf_context = cairo.Context(pdf_surface)
for pagenum in range(pages_count):
@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish()
# Removes metadata added by Poppler
document = Poppler.Document.new_from_file('file://' + tmp_path)
document.set_producer('')
document.set_creator('')
document.save('file://' + os.path.abspath(self.output_filename))
self.__remove_superficial_meta(tmp_path, self.output_filename)
os.remove(tmp_path)
return True
def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
    """
    Blank the producer and creator fields of a PDF.

    The document at `in_file` is opened with Poppler, both fields are set
    to the empty string, and the result is saved to `out_file`.

    :param in_file: path of the PDF to read; appended as-is to 'file://'.
    :param out_file: path of the PDF to write; made absolute before use.
    :return: always True.
    """
    source_uri = 'file://' + in_file
    target_uri = 'file://' + os.path.abspath(out_file)

    pdf_document = Poppler.Document.new_from_file(source_uri)
    pdf_document.set_producer('')
    pdf_document.set_creator('')
    pdf_document.save(target_uri)
    return True
def __parse_metadata_field(self, data:str) -> dict:
metadata = {}

View File

@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
def test_help(self):
    """Running `./main.py --help` prints a usage line that lists -L."""
    proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Only the usage string that includes [-L] is checked: the variant
    # without it is not a substring of the new output.
    self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
def test_no_arg(self):
    """Running `./main.py` without arguments prints a usage line that lists -L."""
    proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Only the usage string that includes [-L] is checked: the variant
    # without it is not a substring of the new output.
    self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
class TestGetMeta(unittest.TestCase):

View File

@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
os.remove('./tests/data/clean.odt')
class TestLightWeightCleaning(unittest.TestCase):
    """Exercise `remove_all_lightweight` on the sample files in tests/data."""

    def test_pdf(self):
        """Lightweight cleaning of a PDF leaves only the format and unset dates."""
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        dirty_parser = pdf.PDFParser('./tests/data/clean.pdf')

        # The working copy must still carry its original producer field.
        dirty_meta = dirty_parser.get_meta()
        self.assertEqual(dirty_meta['producer'], 'pdfTeX-1.40.14')

        self.assertTrue(dirty_parser.remove_all_lightweight())

        # The cleaned output is looked up under the '.cleaned' suffix.
        cleaned_parser = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
        self.assertEqual(
            cleaned_parser.get_meta(),
            {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1},
        )

        os.remove('./tests/data/clean.pdf')

    def test_png(self):
        """Lightweight cleaning of a PNG leaves no metadata at all."""
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
        dirty_parser = images.PNGParser('./tests/data/clean.png')

        # The working copy must still carry its original comment.
        dirty_meta = dirty_parser.get_meta()
        self.assertEqual(dirty_meta['Comment'], 'This is a comment, be careful!')

        self.assertTrue(dirty_parser.remove_all_lightweight())

        cleaned_parser = images.PNGParser('./tests/data/clean.png.cleaned')
        self.assertEqual(cleaned_parser.get_meta(), {})

        os.remove('./tests/data/clean.png')
class TestCleaning(unittest.TestCase):
def test_pdf(self):