diff --git a/libmat2/__init__.py b/libmat2/__init__.py new file mode 100644 index 0000000..3b3dacb --- /dev/null +++ b/libmat2/__init__.py @@ -0,0 +1 @@ +__version__ = '2.0' diff --git a/libmat2/parsers/__init__.py b/libmat2/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libmat2/parsers/abstract.py b/libmat2/parsers/abstract.py new file mode 100644 index 0000000..a9129cc --- /dev/null +++ b/libmat2/parsers/abstract.py @@ -0,0 +1,10 @@ +class AbstractParser(object): + def __init__(self, filename: str): + self.filename = filename + self.meta_list = set() + + def get_meta(self): + raise NotImplementedError + + def remove_all(self): + raise NotImplementedError diff --git a/libmat2/parsers/pdf.py b/libmat2/parsers/pdf.py new file mode 100644 index 0000000..f6bc110 --- /dev/null +++ b/libmat2/parsers/pdf.py @@ -0,0 +1,105 @@ +""" Handle PDF + +""" + +import os +import logging +import tempfile +import shutil +import io + +import cairo +import gi +gi.require_version('Poppler', '0.18') +from gi.repository import Poppler, Gio, GLib + +try: + from PIL import Image +except ImportError: + Image = None + +from . import abstract + +logging.basicConfig(level=logging.DEBUG) + + +class PDFParser(abstract.AbstractParser): + def __init__(self, filename): + super().__init__(filename) + self.meta_list = {'title', 'author', 'subject', + 'keywords', 'creator', 'producer', 'metadata'} + self.uri = 'file://' + os.path.abspath(self.filename) + self.password = None + + def remove_all(self): + """ + Load the document into Poppler, render pages on PNG, + and shove those PNG into a new PDF. Metadata from the new + PDF are removed via Poppler, because there is no way to tell + cairo to not add "created by cairo" during rendering. + + TODO: Improve the resolution + TODO: Don't use a temp file + """ + document = Poppler.Document.new_from_file(self.uri, self.password) + + pdf_out = io.BytesIO() + pdf_surface = cairo.PDFSurface(pdf_out, 128, 128) + pdf_context = cairo.Context(pdf_surface) + + for pagenum in range(document.get_n_pages()): + page = document.get_page(pagenum) + page_width, page_height = page.get_size() + logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) + + img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) + img_context = cairo.Context(img_surface) + + img_context.scale(2, 2) + page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) + img_context.show_page() + + buf = io.BytesIO() + img_surface.write_to_png(buf) + img_surface.finish() + buf.seek(0) + + img = cairo.ImageSurface.create_from_png(buf) + pdf_surface.set_size(page_width*2, page_height*2) + pdf_context.set_source_surface(img, 0, 0) + pdf_context.paint() + pdf_context.show_page() + + pdf_surface.finish() + + b = GLib.Bytes(pdf_out.getvalue()) + input_stream = Gio.MemoryInputStream.new_from_bytes(b) + out_document = Poppler.Document.new_from_stream(input_stream, -1, self.password, None) + metadata = {} + for key in self.meta_list: + if out_document.get_property(key): + metadata[key] = str(out_document.get_property(key)) + out_document.set_producer('totally not MAT2 ;)') + out_document.set_creator('') + print("AFTER") + metadata = {} + for key in self.meta_list: + if out_document.get_property(key): + metadata[key] = str(out_document.get_property(key)) + print("LOL") + out_document.save('file://' + os.path.abspath("olol.pdf")) + + print(metadata) + + return True + + def get_meta(self): + """ Return a dict with all the meta of the file + """ + print("URI: %s", self.uri) + document = Poppler.Document.new_from_file(self.uri, self.password) + metadata = {} + for key in self.meta_list: + if document.get_property(key): + metadata[key] = str(document.get_property(key)) + return metadata diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py index a9129cc..d0e7108 100644 --- a/src/parsers/abstract.py +++ b/src/parsers/abstract.py @@ -1,6 +1,7 @@ class AbstractParser(object): def __init__(self, filename: str): self.filename = filename + self.output_filename = filename + '.cleaned' self.meta_list = set() def get_meta(self): diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index c25b324..a77eabd 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py @@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser): self.uri = 'file://' + os.path.abspath(self.filename) self.password = None - def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO: - """ This is useless as fuck. """ - if Image is None: - return img - ret = io.BytesIO() - im = Image.open(img) - w, h = im.size - resized = im.resize((w, h), Image.ANTIALIAS) - resized.save(ret, optimize=True, format="PNG") - ret.seek(0) - - return ret - - def remove_all(self): """ Load the document into Poppler, render pages on PNG, @@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser): """ document = Poppler.Document.new_from_file(self.uri, self.password) - pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128) + pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) pdf_context = cairo.Context(pdf_surface) for pagenum in range(document.get_n_pages()): @@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser): pdf_surface.finish() - document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) - document.set_producer('totally not MAT2 ;)') - document.set_creator('') - document.save('file://' + os.path.abspath("OUT_clean.pdf")) + # This is removing metadata + #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) + #document.set_producer('totally not MAT2 ;)') + #document.set_creator('') + #document.save('file://' + os.path.abspath("OUT_clean.pdf")) return True diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 56b960e..4751aa4 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -10,18 +10,27 @@ from src.parsers import pdf class TestGetMeta(unittest.TestCase): def test_pdf(self): p = pdf.PDFParser('./tests/data/dirty.pdf') - meta = p.get_meta().items() - + meta = p.get_meta() + self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') + self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'") class TestCleaning(unittest.TestCase): def setUp(self): shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') def tearDown(self): - #os.remove('./tests/data/clean.pdf') - pass + os.remove('./tests/data/clean.pdf') def test_pdf(self): p = pdf.PDFParser('./tests/data/clean.pdf') - p.remove_all() - #self.assertEqual(p.get_meta(), {}) + + meta = p.get_meta() + self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') + + ret = p.remove_all() + self.assertTrue(ret) + + p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') + remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', + 'producer': 'cairo 1.14.10 (http://cairographics.org)'} + self.assertEqual(p.get_meta(), remaining_meta)