First commit

2025-07-01 02:57:53 +02:00 · 2018-03-06 23:20:18 +01:00 · 2018-03-06 23:20:18 +01:00 · 13d2507d60
commit 13d2507d60
7 changed files with 184 additions and 0 deletions
--- a/main.py
+++ b/main.py
@ -0,0 +1,46 @@
 import sys
 from shutil import copyfile
 import argparse
 from src.parsers import pdf
 def create_arg_parser():
    """Build the command-line parser of the tool.

    The parser accepts any number of positional ``files`` plus three
    boolean switches (``--check``, ``--list``, ``--show``) gathered in an
    'Information' argument group.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Metadata anonymisation toolkit 2')
    arg_parser.add_argument('files', nargs='*')

    # One (short flag, long flag, help text) entry per informational switch.
    info_flags = (
        ('-c', '--check', 'check if a file is free of harmful metadatas'),
        ('-l', '--list', 'list all supported fileformats'),
        ('-s', '--show',
         'list all the harmful metadata of a file without removing them'),
    )
    info_group = arg_parser.add_argument_group('Information')
    for short_flag, long_flag, help_text in info_flags:
        info_group.add_argument(short_flag, long_flag, action='store_true',
                                help=help_text)
    return arg_parser
 def show_meta(file_name: str):
    """Print the metadata of a PDF file, one ``key: value`` pair per line.

    Args:
        file_name: Path of the file handed to ``pdf.PDFParser``.
    """
    metadata = pdf.PDFParser(file_name).get_meta()
    for name, value in metadata.items():
        print("%s: %s" % (name, value))
 def main():
    """Command-line entry point.

    With ``--show``, print the metadata of every listed file. Without any
    file argument, print the usage help. Otherwise clean the first listed
    file with ``pdf.PDFParser.remove_all`` and print ``ok``.

    Returns:
        0 after ``--show``; ``None`` in every other case.
    """
    argparser = create_arg_parser()
    args = argparser.parse_args()

    if args.show:
        for f in args.files:
            show_meta(f)
        return 0
    if not args.files:
        # The parser object is `argparser`, and argparse's method for this
        # is print_help(); the previous `parser.show_help()` raised NameError.
        return argparser.print_help()

    # Take the target from the parsed positionals rather than sys.argv[1],
    # which is an option flag when invoked as e.g. `-c file.pdf`.
    target = args.files[0]
    # NOTE(review): this overwrites the target with its '.bak' sibling before
    # cleaning and raises if that sibling does not exist -- it looks like
    # development scaffolding; confirm it is intended.
    copyfile(target + '.bak', target)
    pdf.PDFParser(target).remove_all()
    print("ok")
 # Run the command-line interface only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == '__main__':
    main()
--- a/src/init.py
+++ b/src/init.py
--- a/src/parsers/init.py
+++ b/src/parsers/init.py
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@ -0,0 +1,10 @@
 class AbstractParser:
    """Base class defining the interface shared by the file parsers.

    Subclasses are expected to override ``get_meta`` and ``remove_all``;
    the versions defined here only raise ``NotImplementedError``.
    """

    def __init__(self, filename: str):
        """Remember the target file and start with an empty metadata set.

        Args:
            filename: Path of the file this parser operates on.
        """
        # Names of the metadata fields the parser knows about; empty here.
        self.meta_list = set()
        self.filename = filename

    def get_meta(self):
        """Return the metadata of the file; must be overridden."""
        raise NotImplementedError()

    def remove_all(self):
        """Remove the metadata of the file; must be overridden."""
        raise NotImplementedError()
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@ -0,0 +1,106 @@
 """ Handle PDF files: list their metadata and remove it by re-rendering
 the document.
 """
 import os
 import logging
 import tempfile
 import shutil
 import io
 import cairo
 import gi
 gi.require_version('Poppler', '0.18')
 from gi.repository import Poppler
 try:
    from PIL import Image
 except ImportError:
    Image = None
 from . import abstract
 # Configure the root logger as soon as this module is imported, so that
 # messages of level DEBUG and above are emitted.
 logging.basicConfig(level=logging.DEBUG)
 class PDFParser(abstract.AbstractParser):
    """Parser that lists and removes the metadata of a PDF file.

    The document is opened through Poppler; cleaning re-renders every page
    as an image with cairo and assembles the images into a new PDF.
    """

    def __init__(self, filename):
        """Prepare a parser for `filename`.

        Args:
            filename: Path of the PDF file; it may be relative.
        """
        super().__init__(filename)
        # Names of the Poppler document properties inspected by get_meta().
        self.meta_list = {'title', 'author', 'subject',
            'keywords', 'creator', 'producer', 'metadata'}
        self.uri = self._path_to_uri(self.filename)
        # Passed as-is to Poppler when opening documents; never set to
        # anything but None inside this class.
        self.password = None

    @staticmethod
    def _path_to_uri(path):
        """Return an absolute, percent-encoded ``file://`` URI for `path`.

        Plain string concatenation leaves characters such as '#' or '%'
        unescaped, which yields an invalid URI for such file names.
        """
        # Function-scope import: only this helper needs pathlib.
        from pathlib import Path
        return Path(os.path.abspath(path)).as_uri()

    def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
        """Re-encode a PNG with Pillow's ``optimize`` flag, same dimensions.

        Not called anywhere in this class at the moment.

        Args:
            img: Buffer holding the source image.

        Returns:
            A new buffer, rewound to its start, holding the re-encoded PNG;
            `img` itself when Pillow is not installed.
        """
        if Image is None:
            return img
        ret = io.BytesIO()
        im = Image.open(img)
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = im.resize(im.size, Image.LANCZOS)
        resized.save(ret, optimize=True, format="PNG")
        ret.seek(0)
        return ret

    def remove_all(self):
        """
            Load the document into Poppler, render each page to a PNG at
            twice its size, and paint those PNG into a new PDF, ``OUT.pdf``.
            The producer and creator of that PDF are then overwritten via
            Poppler, because there is no way to tell cairo to not add
            "created by cairo" during rendering, and the result is saved as
            ``OUT_clean.pdf``. Both files are written to the current
            working directory.

            TODO: Improve the resolution
            TODO: Don't use a temp file

            Returns:
                True once ``OUT_clean.pdf`` has been saved.
        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        n_pages = document.get_n_pages()
        # The initial surface size is a placeholder; it is reset per page.
        pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(n_pages):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, n_pages)

            # Rasterise the page at 2x scale onto an in-memory image.
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
            img_context = cairo.Context(img_surface)
            img_context.scale(2, 2)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through PNG so only pixels reach the output PDF.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)
            # The optional __optimize_image_size() pass is currently disabled.

            img = cairo.ImageSurface.create_from_png(buf)
            # Resize the PDF page before drawing anything on it.
            pdf_surface.set_size(page_width*2, page_height*2)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Re-open the cairo output to overwrite the fields cairo filled in.
        document = Poppler.Document.new_from_file(self._path_to_uri('OUT.pdf'), self.password)
        document.set_producer('totally not MAT2 ;)')
        document.set_creator('')
        document.save(self._path_to_uri("OUT_clean.pdf"))
        return True

    def get_meta(self):
        """ Return a dict with all the meta of the file

        Only the properties named in ``self.meta_list`` whose value is
        truthy are included.
        """
        # Log through logging instead of print(): the former call printed the
        # literal "%s" and mixed this debug line into the program's output.
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata
--- a/tests/data/dirty.pdf
+++ b/tests/data/dirty.pdf
--- a/tests/main.py
+++ b/tests/main.py
@ -0,0 +1,22 @@
 #!/usr/bin/python3
 import unittest
 class TestCleaning(unittest.TestCase):
    # Placeholder suite: it only exercises built-in ``str`` methods so far.

    def test_pdf(self):
        # Upper-casing a lowercase word yields the all-caps word.
        shouted = 'foo'.upper()
        self.assertEqual(shouted, 'FOO')

    def test_isupper(self):
        # An all-caps word is upper case, a capitalised one is not.
        all_caps = 'FOO'.isupper()
        capitalised = 'Foo'.isupper()
        self.assertTrue(all_caps)
        self.assertFalse(capitalised)

    def test_split(self):
        sentence = 'hello world'
        words = sentence.split()
        self.assertEqual(words, ['hello', 'world'])
        # check that split fails when the separator is not a string
        self.assertRaises(TypeError, sentence.split, 2)
 # Run the test suite through unittest's command-line runner when this file
 # is executed directly.
 if __name__ == '__main__':
    unittest.main()