1
0
mirror of synced 2024-11-22 09:14:23 +01:00

Add lightweight processing for PDF

This commit is contained in:
jvoisin 2018-04-14 21:23:31 +02:00
parent 6f4ed2490f
commit 96299c6a53
5 changed files with 84 additions and 15 deletions

15
main.py
View File

@ -31,6 +31,8 @@ def create_arg_parser():
help='list all supported fileformats') help='list all supported fileformats')
info.add_argument('-s', '--show', action='store_true', info.add_argument('-s', '--show', action='store_true',
help='list all the harmful metadata of a file without removing them') help='list all the harmful metadata of a file without removing them')
info.add_argument('-L', '--lightweight', action='store_true',
help='remove SOME metadata')
return parser return parser
@ -50,7 +52,7 @@ def show_meta(filename:str):
print(" %s: harmful content" % k) print(" %s: harmful content" % k)
def clean_meta(filename:str): def clean_meta(filename:str, is_lightweigth:bool):
if not __check_file(filename, os.R_OK|os.W_OK): if not __check_file(filename, os.R_OK|os.W_OK):
return return
@ -58,7 +60,10 @@ def clean_meta(filename:str):
if p is None: if p is None:
print("[-] %s's format (%s) is not supported" % (filename, mtype)) print("[-] %s's format (%s) is not supported" % (filename, mtype))
return return
p.remove_all() if is_lightweigth:
p.remove_all_lightweight()
else:
p.remove_all()
def show_parsers(): def show_parsers():
@ -78,12 +83,12 @@ def __get_files_recursively(files):
for _f in _files: for _f in _files:
yield os.path.join(path, _f) yield os.path.join(path, _f)
def __do_clean_async(is_lightweigth, q):
    """Worker loop: clean every file name pulled from the queue `q`.

    The loop ends when a `None` sentinel is read from the queue.

    `is_lightweigth` is forwarded to clean_meta(), which uses it to pick
    between remove_all_lightweight() and remove_all().
    """
    while True:
        f = q.get()
        if f is None:  # nothing more to process
            return
        # clean_meta() is declared as (filename, is_lightweigth): the file
        # name goes first, the flag second.
        clean_meta(f, is_lightweigth)
        q.task_done()
@ -109,7 +114,7 @@ def main():
q.put(f) q.put(f)
for _ in range(multiprocessing.cpu_count()): for _ in range(multiprocessing.cpu_count()):
worker = Thread(target=__do_clean_async, args=(q, )) worker = Thread(target=__do_clean_async, args=(mode, q))
worker.start() worker.start()
threads.append(worker) threads.append(worker)

View File

@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
def remove_all(self) -> bool: def remove_all(self) -> bool:
pass pass
def remove_all_lightweight(self) -> bool:
    """
    Remove only _some_ of the metadata.

    This default implementation simply delegates to remove_all() and
    returns its result; a parser may override it.
    """
    outcome = self.remove_all()
    return outcome

View File

@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render self.__scale = 2 # how much precision do we want for the render
def remove_all_lightweight(self):
    """
    Load the document into Poppler, render pages on a new PDFSurface.

    Each page is drawn with `render_for_printing` onto a cairo PDFSurface
    backed by a temporary file. The producer and creator fields of that
    intermediate PDF are then blanked while it is saved to
    `self.output_filename`, and the temporary file is deleted.

    Returns True once the output has been written.
    """
    document = Poppler.Document.new_from_file(self.uri, None)
    pages_count = document.get_n_pages()

    # mkstemp() returns an *open* OS-level descriptor together with the
    # path. Only the path is used below, so close the descriptor right
    # away instead of leaking it.
    tmp_fd, tmp_path = tempfile.mkstemp()
    os.close(tmp_fd)

    try:
        # The initial size is a placeholder: it is reset for every page.
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            pdf_surface.set_size(page_width, page_height)
            pdf_context.save()
            page.render_for_printing(pdf_context)
            pdf_context.restore()
            pdf_context.show_page()  # draw pdf_context on pdf_surface
        pdf_surface.finish()

        self.__remove_superficial_meta(tmp_path, self.output_filename)
    finally:
        # Delete the intermediate file even when rendering or saving fails.
        os.remove(tmp_path)

    return True
def remove_all(self): def remove_all(self):
""" """
Load the document into Poppler, render pages on PNG, Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF. Metadata from the new and shove those PNG into a new PDF.
PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering.
""" """
document = Poppler.Document.new_from_file(self.uri, None) document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages() pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp() _, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
pdf_context = cairo.Context(pdf_surface) pdf_context = cairo.Context(pdf_surface)
for pagenum in range(pages_count): for pagenum in range(pages_count):
@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish() pdf_surface.finish()
# Removes metadata added by Poppler # Removes metadata added by Poppler
document = Poppler.Document.new_from_file('file://' + tmp_path) self.__remove_superficial_meta(tmp_path, self.output_filename)
document.set_producer('')
document.set_creator('')
document.save('file://' + os.path.abspath(self.output_filename))
os.remove(tmp_path) os.remove(tmp_path)
return True return True
def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
    """
    Blank the producer and creator fields of a PDF.

    The PDF at `in_file` is loaded with Poppler, its producer and creator
    are set to empty strings, and the result is saved to `out_file`.
    Both paths are made absolute before being turned into file:// URIs,
    so relative paths are accepted for either of them.

    Always returns True.
    """
    in_uri = 'file://' + os.path.abspath(in_file)
    out_uri = 'file://' + os.path.abspath(out_file)

    # No password: spelled out as None, like the other Poppler loads of
    # this class.
    document = Poppler.Document.new_from_file(in_uri, None)
    document.set_producer('')
    document.set_creator('')
    document.save(out_uri)
    return True
def __parse_metadata_field(self, data:str) -> dict: def __parse_metadata_field(self, data:str) -> dict:
metadata = {} metadata = {}

View File

@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
def test_help(self): def test_help(self):
proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE) proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
stdout, _ = proc.communicate() stdout, _ = proc.communicate()
self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
def test_no_arg(self): def test_no_arg(self):
proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE) proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
stdout, _ = proc.communicate() stdout, _ = proc.communicate()
self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout) self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
class TestGetMeta(unittest.TestCase): class TestGetMeta(unittest.TestCase):

View File

@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.odt')
class TestLightWeightCleaning(unittest.TestCase):
    """Exercise remove_all_lightweight() on the sample files in tests/data."""

    def test_pdf(self):
        """Lightweight cleaning of a PDF drops the producer field."""
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

        # Remove both the working copy and the generated output, so that
        # no artefact is left behind in tests/data.
        os.remove('./tests/data/clean.pdf')
        os.remove('./tests/data/clean.pdf.cleaned')

    def test_png(self):
        """Lightweight cleaning of a PNG leaves no metadata behind."""
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
        p = images.PNGParser('./tests/data/clean.png')

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.png.cleaned')
        self.assertEqual(p.get_meta(), {})

        # Remove both the working copy and the generated output.
        os.remove('./tests/data/clean.png')
        os.remove('./tests/data/clean.png.cleaned')
class TestCleaning(unittest.TestCase): class TestCleaning(unittest.TestCase):
def test_pdf(self): def test_pdf(self):