commit 13d2507d605ca418dfe8abbb63f5de69cd8a6bec
Author: Julien (jvoisin) Voisin
Date:   Tue Mar 6 23:20:18 2018 +0100

    First commit

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e4157e6
--- /dev/null
+++ b/main.py
@@ -0,0 +1,56 @@
+import sys
+from shutil import copyfile
+import argparse
+
+from src.parsers import pdf
+
+
+def create_arg_parser():
+    """ Build the command-line argument parser. """
+    parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
+    parser.add_argument('files', nargs='*')
+
+    info = parser.add_argument_group('Information')
+    info.add_argument('-c', '--check', action='store_true',
+                      help='check if a file is free of harmful metadatas')
+    info.add_argument('-l', '--list', action='store_true',
+                      help='list all supported fileformats')
+    info.add_argument('-s', '--show', action='store_true',
+                      help='list all the harmful metadata of a file without removing them')
+    return parser
+
+
+def show_meta(file_name: str):
+    """ Print every metadata entry found in `file_name`, one per line. """
+    p = pdf.PDFParser(file_name)
+    for k, v in p.get_meta().items():
+        print("%s: %s" % (k, v))
+
+
+def main():
+    """ Entry point: show the metadata of the given files, or clean the first one. """
+    argparser = create_arg_parser()
+    args = argparser.parse_args()
+
+    if args.show:
+        for f in args.files:
+            show_meta(f)
+        return 0
+    elif not args.files:
+        # Nothing to process: print the usage instead of crashing.
+        return argparser.print_help()
+
+    # Use the parsed positional argument rather than sys.argv[1], which is
+    # an option (e.g. `-c`) whenever one is given before the file name.
+    target = args.files[0]
+    # The file is first restored from its `.bak` sibling.
+    copyfile(target + '.bak', target)
+    p = pdf.PDFParser(target)
+    p.remove_all()
+    p = pdf.PDFParser('OUT_clean.pdf')
+    print("ok")
+
+
+if __name__ == '__main__':
+
+    main()
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/parsers/__init__.py b/src/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
new file mode 100644
index 0000000..a9129cc
--- /dev/null
+++ b/src/parsers/abstract.py
@@ -0,0 +1,16 @@
+class AbstractParser(object):
+    """ Common interface of the per-format metadata parsers. """
+
+    def __init__(self, filename: str):
+        self.filename = filename
+        # Names of the metadata fields handled by the parser; subclasses
+        # replace this empty set with their own.
+        self.meta_list = set()
+
+    def get_meta(self):
+        """ Return the metadata of the file; must be overridden. """
+        raise NotImplementedError
+
+    def remove_all(self):
+        """ Remove the metadata of the file; must be overridden. """
+        raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
new file mode 100644
index 0000000..c25b324
--- /dev/null
+++ b/src/parsers/pdf.py
@@ -0,0 +1,117 @@
+""" Handle PDF
+
+"""
+
+import os
+import logging
+import tempfile
+import shutil
+import io
+
+import cairo
+import gi
+gi.require_version('Poppler', '0.18')
+from gi.repository import Poppler
+
+try:
+    from PIL import Image
+except ImportError:
+    Image = None
+
+from . import abstract
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+class PDFParser(abstract.AbstractParser):
+    """ Read and strip the metadata of a PDF file through Poppler and cairo. """
+
+    def __init__(self, filename):
+        super().__init__(filename)
+        self.meta_list = {'title', 'author', 'subject',
+                          'keywords', 'creator', 'producer', 'metadata'}
+        self.uri = 'file://' + os.path.abspath(self.filename)
+        self.password = None
+
+    def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
+        """ Re-encode `img` as an optimized PNG of the same dimensions.
+
+        `img` is returned untouched when PIL is not installed.
+        """
+        if Image is None:
+            return img
+        ret = io.BytesIO()
+        im = Image.open(img)
+        w, h = im.size
+        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
+        resized = im.resize((w, h), Image.LANCZOS)
+        resized.save(ret, optimize=True, format="PNG")
+        ret.seek(0)
+
+        return ret
+
+    def remove_all(self):
+        """
+            Load the document into Poppler, render pages on PNG,
+            and shove those PNG into a new PDF. Metadata from the new
+            PDF are removed via Poppler, because there is no way to tell
+            cairo to not add "created by cairo" during rendering.
+
+            The result is written to `OUT_clean.pdf`, through the
+            intermediate `OUT.pdf`, both in the current directory.
+
+            TODO: Improve the resolution
+            TODO: Don't use a temp file
+        """
+        document = Poppler.Document.new_from_file(self.uri, self.password)
+        pages_count = document.get_n_pages()
+
+        pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
+        pdf_context = cairo.Context(pdf_surface)
+
+        for pagenum in range(pages_count):
+            page = document.get_page(pagenum)
+            page_width, page_height = page.get_size()
+            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
+
+            # Render at twice the page size.
+            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
+            img_context = cairo.Context(img_surface)
+
+            img_context.scale(2, 2)
+            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
+            img_context.show_page()
+
+            buf = io.BytesIO()
+            img_surface.write_to_png(buf)
+            img_surface.finish()
+            buf.seek(0)
+
+            #buf = self.__optimize_image_size(buf)
+
+            img = cairo.ImageSurface.create_from_png(buf)
+            pdf_surface.set_size(page_width*2, page_height*2)
+            pdf_context.set_source_surface(img, 0, 0)
+            pdf_context.paint()
+            pdf_context.show_page()
+
+        pdf_surface.finish()
+
+        document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
+        document.set_producer('totally not MAT2 ;)')
+        document.set_creator('')
+        document.save('file://' + os.path.abspath("OUT_clean.pdf"))
+
+        return True
+
+    def get_meta(self):
+        """ Return a dict with all the meta of the file
+        """
+        logging.debug("URI: %s", self.uri)
+        document = Poppler.Document.new_from_file(self.uri, self.password)
+        metadata = {}
+        for key in self.meta_list:
+            value = document.get_property(key)
+            if value:
+                metadata[key] = value
+        return metadata
diff --git a/tests/data/dirty.pdf b/tests/data/dirty.pdf
new file mode 100644
index 0000000..0d88779
Binary files /dev/null and b/tests/data/dirty.pdf differ
diff --git a/tests/main.py b/tests/main.py
new file mode 100644
index 0000000..52828af
--- /dev/null
+++ b/tests/main.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python3
+
+import unittest
+
+class TestCleaning(unittest.TestCase):
+    def test_pdf(self):
+        self.assertEqual('foo'.upper(), 'FOO')
+
+    def test_isupper(self):
+        self.assertTrue('FOO'.isupper())
+        self.assertFalse('Foo'.isupper())
+
+    def test_split(self):
+        s = 'hello world'
+        self.assertEqual(s.split(), ['hello', 'world'])
+        # check that s.split fails when the separator is not a string
+        with self.assertRaises(TypeError):
+            s.split(2)
+
+
+if __name__ == '__main__':
+    unittest.main()