mat2/libmat2/pdf.py

""" Handle PDF

Read the metadata of PDF files with Poppler, and produce cleaned copies
by re-rendering every page onto a fresh cairo PDF surface.
"""

import os
import re
import logging
import tempfile
import io
from typing import Union, Dict

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib

from . import abstract

# Every PDF surface created in this module is restricted to this single,
# fixed PDF version: this keeps the testsuite from breaking, and marginally
# increases fingerprinting resistance.
FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5


class PDFParser(abstract.AbstractParser):
    """ Parser for PDF files, reading them with Poppler and writing them
    back with cairo.

    Cleaning is done by rendering every page of the document onto a new
    cairo PDF surface, either directly (lightweight mode) or through an
    intermediate PNG raster (thorough mode), and then scrubbing the
    metadata that cairo itself adds to its output.
    """
    mimetypes = {'application/pdf', }
    # Names of the Poppler document properties reported by `get_meta`.
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """ Check that `filename` can be opened by Poppler.

        :param filename: path of the PDF file to handle.
        :raises ValueError: if Poppler is unable to load the file.
        """
        super().__init__(filename)
        self.uri = self.__path_to_uri(self.filename)
        self.__scale = 200 / 72.0  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as err:  # Invalid PDF
            # Same exception type as before, but keep the reason around.
            raise ValueError("Invalid PDF: %s" % err) from err

    @staticmethod
    def __path_to_uri(path: str) -> str:
        """ Return the `file://` URI designating `path`.

        Concatenating 'file://' and the path is not enough: characters
        like '#' or '%' must be percent-encoded, otherwise the URI is
        rejected or decoded to a different name when it is converted
        back into a path.
        """
        return GLib.filename_to_uri(os.path.abspath(path), None)

    @staticmethod
    def __create_tmp_file() -> str:
        """ Create an empty temporary file and return its path.

        `tempfile.mkstemp` hands back an open file descriptor along with
        the path; only the path is needed here, so the descriptor is
        closed right away instead of being leaked.
        """
        fd, path = tempfile.mkstemp()
        os.close(fd)
        return path

    def remove_all(self) -> bool:
        """ Write a cleaned copy of the document to `self.output_filename`.

        :return: True on success, False if a page couldn't be obtained.
        """
        if self.lightweight_cleaning is True:
            return self.__remove_all_lightweight()
        return self.__remove_all_thorough()

    def __remove_all_lightweight(self) -> bool:
        """
            Load the document into Poppler, render pages on a new PDFSurface.

            :return: True on success, False if a page couldn't be obtained.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__create_tmp_file()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
            try:
                pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
                pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

                for pagenum in range(pages_count):
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    pdf_surface.set_size(page_width, page_height)
                    pdf_context.save()
                    page.render_for_printing(pdf_context)
                    pdf_context.restore()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always flush/close the surface before the temporary file
                # is read back or deleted.
                pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Don't leave the temporary file behind, whatever happened.
            os.remove(tmp_path)

        return True

    def __remove_all_thorough(self) -> bool:
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.

            :return: True on success, False if a page couldn't be obtained.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__create_tmp_file()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            try:
                pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
                pdf_context = cairo.Context(pdf_surface)

                for pagenum in range(pages_count):
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                    # Size in pixels of the raster for this page.
                    width = int(page_width * self.__scale)
                    height = int(page_height * self.__scale)
                    img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
                    img_context = cairo.Context(img_surface)

                    img_context.scale(self.__scale, self.__scale)
                    page.render_for_printing(img_context)
                    img_context.show_page()

                    # Round-trip the raster through an in-memory PNG.
                    buf = io.BytesIO()
                    img_surface.write_to_png(buf)
                    img_surface.finish()
                    buf.seek(0)

                    img = cairo.ImageSurface.create_from_png(buf)
                    if cairo.version_info < (1, 12, 0):
                        # No set_device_scale() available: make the page as
                        # big as the raster instead.
                        pdf_surface.set_size(width, height)
                    else:
                        # Keep the original page size, and shrink the
                        # raster back onto it.
                        pdf_surface.set_size(page_width, page_height)
                        pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
                    pdf_context.set_source_surface(img, 0, 0)
                    pdf_context.paint()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always flush/close the surface before the temporary file
                # is read back or deleted.
                pdf_surface.finish()

            # Removes metadata added by Poppler
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Don't leave the temporary file behind, whatever happened.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """ Copy `in_file` to `out_file`, without the metadata added by
        the rendering process.

        :param in_file: path of the freshly rendered PDF.
        :param out_file: path where the scrubbed PDF is written.
        :return: always True.
        """
        document = Poppler.Document.new_from_file(
            PDFParser.__path_to_uri(in_file), None)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save(PDFParser.__path_to_uri(out_file))

        # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
        # fails to remove them, we have to use this terrible regex.
        # It should(tm) be alright though, because cairo's output format
        # for metadata is fixed.
        with open(out_file, 'rb') as f:
            # `count` and `flags` are passed by keyword: passing them
            # positionally is deprecated in recent Python versions.
            out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
                         count=0, flags=re.DOTALL | re.IGNORECASE)
        with open(out_file, 'wb') as f:
            f.write(out)

        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> Dict[str, str]:
        """ Extract the `<prefix:key>value</prefix:key>` pairs from `data`,
        for the xmp, pdfx, pdf and xmpMM prefixes (case-insensitively).

        :return: a dict mapping each key (without its prefix) to its value;
                 if a key appears several times, its last value wins.
        """
        pairs = re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I)
        return {key: value for (_, key, value) in pairs}

    def get_meta(self) -> Dict[str, Union[str, Dict]]:
        """ Return a dict with all the meta of the file

        Only the non-empty properties listed in `meta_list` are reported.
        If the 'metadata' property is present, the fields parsed out of it
        are merged into the result as well.
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)  # queried once per key
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            metadata.update(self.__parse_metadata_field(metadata['metadata']))
        return metadata
First commit 2018-03-06 23:20:18 +01:00			`""" Handle PDF`

			`"""`

			`import os`
Improve the way we parse/display pdf metadata 2018-04-11 23:20:59 +02:00			`import re`
First commit 2018-03-06 23:20:18 +01:00			`import logging`
			`import tempfile`
			`import io`
Fix the type annotations 2023-01-28 16:57:20 +01:00			`from typing import Union, Dict`
First commit 2018-03-06 23:20:18 +01:00
			`import cairo`
			`import gi`
			`gi.require_version('Poppler', '0.18')`
Test for faulty files, and document how MAT2 is behaving wrt. them 2018-05-06 21:58:31 +02:00			`from gi.repository import Poppler, GLib`
First commit 2018-03-06 23:20:18 +01:00
			`from . import abstract`

Fix the PDF version This should prevent the testsuite from breaking, and marginally increase fingerprinting resistance. 2022-03-28 22:34:57 +02:00			`FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5`
First commit 2018-03-06 23:20:18 +01:00
Fix the type annotations 2023-01-28 16:57:20 +01:00
First commit 2018-03-06 23:20:18 +01:00			`class PDFParser(abstract.AbstractParser):`
Implement mimetype detection 2018-03-19 23:43:49 +01:00			`mimetypes = {'application/pdf', }`
			`meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',`
Do a pylint pass 2018-05-16 22:36:59 +02:00			`'metadata', 'mod-date', 'producer', 'subject', 'title',`
			`'viewer-preferences'}`
Implement mimetype detection 2018-03-19 23:43:49 +01:00
First commit 2018-03-06 23:20:18 +01:00			`def __init__(self, filename):`
			`super().__init__(filename)`
			`self.uri = 'file://' + os.path.abspath(self.filename)`
Fix pdf issues on printers pyCairo by default renders the PDF surfaces with a resolution of 72 dpi which is so low that the bitmap gets blurred compared to original. Since pyCairo 1.12.0, a new method set_device_scale(x_scale, y_scale) is added, which allows changing the canvas resolution. 2021-07-25 14:10:36 +02:00			`self.__scale = 200 / 72.0 # how much precision do we want for the render`
Test for faulty files, and document how MAT2 is behaving wrt. them 2018-05-06 21:58:31 +02:00			`try: # Check now that the file is valid, to avoid surprises later`
			`Poppler.Document.new_from_file(self.uri, None)`
			`except GLib.GError: # Invalid PDF`
			`raise ValueError`
First commit 2018-03-06 23:20:18 +01:00
Refactor lightweight mode implementation 2018-10-12 11:49:24 +02:00			`def remove_all(self) -> bool:`
			`if self.lightweight_cleaning is True:`
			`return self.__remove_all_lightweight()`
			`return self.__remove_all_thorough()`

			`def __remove_all_lightweight(self) -> bool:`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`"""`
			`Load the document into Poppler, render pages on a new PDFSurface.`
			`"""`
			`document = Poppler.Document.new_from_file(self.uri, None)`
			`pages_count = document.get_n_pages()`

			`tmp_path = tempfile.mkstemp()[1]`
Improve the code's documentation 2018-07-19 23:10:27 +02:00			`pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway`
Fix the PDF version This should prevent the testsuite from breaking, and marginally increase fingerprinting resistance. 2022-03-28 22:34:57 +02:00			`pdf_surface.restrict_to_version(FIXED_PDF_VERSION)`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`pdf_context = cairo.Context(pdf_surface) # context draws on the surface`

			`for pagenum in range(pages_count):`
			`logging.info("Rendering page %d/%d", pagenum + 1, pages_count)`
			`page = document.get_page(pagenum)`
			`page_width, page_height = page.get_size()`
			`pdf_surface.set_size(page_width, page_height)`
			`pdf_context.save()`
			`page.render_for_printing(pdf_context)`
			`pdf_context.restore()`
			`pdf_context.show_page() # draw pdf_context on pdf_surface`
			`pdf_surface.finish()`

			`self.__remove_superficial_meta(tmp_path, self.output_filename)`
			`os.remove(tmp_path)`

			`return True`

Refactor lightweight mode implementation 2018-10-12 11:49:24 +02:00			`def __remove_all_thorough(self) -> bool:`
First commit 2018-03-06 23:20:18 +01:00			`"""`
			`Load the document into Poppler, render pages on PNG,`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`and shove those PNG into a new PDF.`
First commit 2018-03-06 23:20:18 +01:00			`"""`
Implement mimetype detection 2018-03-19 23:43:49 +01:00			`document = Poppler.Document.new_from_file(self.uri, None)`
Clean metadata 2018-03-18 23:48:14 +01:00			`pages_count = document.get_n_pages()`
First commit 2018-03-06 23:20:18 +01:00
Clean metadata 2018-03-18 23:48:14 +01:00			`_, tmp_path = tempfile.mkstemp()`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway`
Fix the PDF version This should prevent the testsuite from breaking, and marginally increase fingerprinting resistance. 2022-03-28 22:34:57 +02:00			`pdf_surface.restrict_to_version(FIXED_PDF_VERSION)`
First commit 2018-03-06 23:20:18 +01:00			`pdf_context = cairo.Context(pdf_surface)`

Clean metadata 2018-03-18 23:48:14 +01:00			`for pagenum in range(pages_count):`
First commit 2018-03-06 23:20:18 +01:00			`page = document.get_page(pagenum)`
Bump coverage 2020-11-13 17:27:23 +01:00			`if page is None: # pragma: no cover`
Better handling of malformed pdf 2020-11-06 16:05:24 +01:00			`logging.error("Unable to get PDF pages")`
			`return False`
First commit 2018-03-06 23:20:18 +01:00			`page_width, page_height = page.get_size()`
Clean metadata 2018-03-18 23:48:14 +01:00			`logging.info("Rendering page %d/%d", pagenum + 1, pages_count)`
First commit 2018-03-06 23:20:18 +01:00
Fix pdf issues on printers pyCairo by default renders the PDF surfaces with a resolution of 72 dpi which is so low that the bitmap gets blurred compared to original. Since pyCairo 1.12.0, a new method set_device_scale(x_scale, y_scale) is added, which allows changing the canvas resolution. 2021-07-25 14:10:36 +02:00			`width = int(page_width * self.__scale)`
			`height = int(page_height * self.__scale)`
Improve a bit the formatting of the code thanks to pyflakes3 2018-07-02 00:22:05 +02:00			`img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)`
First commit 2018-03-06 23:20:18 +01:00			`img_context = cairo.Context(img_surface)`

Scale for PDF is now parametrable 2018-03-19 23:51:35 +01:00			`img_context.scale(self.__scale, self.__scale)`
Clean metadata 2018-03-18 23:48:14 +01:00			`page.render_for_printing(img_context)`
First commit 2018-03-06 23:20:18 +01:00			`img_context.show_page()`

			`buf = io.BytesIO()`
			`img_surface.write_to_png(buf)`
			`img_surface.finish()`
			`buf.seek(0)`

			`img = cairo.ImageSurface.create_from_png(buf)`
Fix pdf issues on printers pyCairo by default renders the PDF surfaces with a resolution of 72 dpi which is so low that the bitmap gets blurred compared to original. Since pyCairo 1.12.0, a new method set_device_scale(x_scale, y_scale) is added, which allows changing the canvas resolution. 2021-07-25 14:10:36 +02:00			`if cairo.version_info < (1, 12, 0):`
			`pdf_surface.set_size(width, height)`
			`else:`
			`pdf_surface.set_size(page_width, page_height)`
			`pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)`
First commit 2018-03-06 23:20:18 +01:00			`pdf_context.set_source_surface(img, 0, 0)`
			`pdf_context.paint()`
Improve the code's documentation 2018-07-19 23:10:27 +02:00			`pdf_context.show_page() # draw pdf_context on pdf_surface`
First commit 2018-03-06 23:20:18 +01:00
			`pdf_surface.finish()`

Clean up the code for PDF handling 2018-04-02 23:36:56 +02:00			`# Removes metadata added by Poppler`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`self.__remove_superficial_meta(tmp_path, self.output_filename)`
Clean metadata 2018-03-18 23:48:14 +01:00			`os.remove(tmp_path)`
First commit 2018-03-06 23:20:18 +01:00
			`return True`

Do a pylint pass 2018-05-16 22:36:59 +02:00			`@staticmethod`
			`def __remove_superficial_meta(in_file: str, out_file: str) -> bool:`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`document = Poppler.Document.new_from_file('file://' + in_file)`
			`document.set_producer('')`
			`document.set_creator('')`
Remove a leftover pdf metadata 2018-05-14 22:44:17 +02:00			`document.set_creation_date(-1)`
Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`document.save('file://' + os.path.abspath(out_file))`
Remove a couple of residual metadata in pdf This commit takes care of removing residual metadata added by mat2 during the cleaning of pdf. 2020-02-08 16:08:32 +01:00
			`# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes`
			`# fails to remove them, we have to use this terrible regex.`
			`# It should(tm) be alright though, because cairo's output format`
			`# for metadata is fixed.`
			`with open(out_file, 'rb') as f:`
			`out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,`
			`re.DOTALL \| re.IGNORECASE)`
			`with open(out_file, 'wb') as f:`
			`f.write(out)`

Add lightweight processing for PDF 2018-04-14 21:23:31 +02:00			`return True`

Do a pylint pass 2018-05-16 22:36:59 +02:00			`@staticmethod`
Fix the type annotations 2023-01-28 16:57:20 +01:00			`def __parse_metadata_field(data: str) -> Dict[str, str]:`
Improve the way we parse/display pdf metadata 2018-04-11 23:20:59 +02:00			`metadata = {}`
			`for (_, key, value) in re.findall(r"<(xmp\|pdfx\|pdf\|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):`
			`metadata[key] = value`
			`return metadata`

Fix the type annotations 2023-01-28 16:57:20 +01:00			`def get_meta(self) -> Dict[str, Union[str, Dict]]:`
First commit 2018-03-06 23:20:18 +01:00			`""" Return a dict with all the meta of the file`
			`"""`
			`metadata = {}`
Test for faulty files, and document how MAT2 is behaving wrt. them 2018-05-06 21:58:31 +02:00			`document = Poppler.Document.new_from_file(self.uri, None)`

First commit 2018-03-06 23:20:18 +01:00			`for key in self.meta_list:`
			`if document.get_property(key):`
			`metadata[key] = document.get_property(key)`
Improve the way we parse/display pdf metadata 2018-04-11 23:20:59 +02:00			`if 'metadata' in metadata:`
Do a pylint pass 2018-05-16 22:36:59 +02:00			`parsed_meta = self.__parse_metadata_field(metadata['metadata'])`
Add more typing and use mypy in the CI 2018-06-04 22:54:01 +02:00			`for key, value in parsed_meta.items():`
			`metadata[key] = value`
First commit 2018-03-06 23:20:18 +01:00			`return metadata`