1
0
mirror of synced 2024-11-22 17:24:23 +01:00
mat2/libmat2/pdf.py

164 lines
5.9 KiB
Python
Raw Normal View History

2018-03-06 23:20:18 +01:00
""" Handle PDF
"""
import os
import re
2018-03-06 23:20:18 +01:00
import logging
import tempfile
import io
2023-01-28 16:57:20 +01:00
from typing import Union, Dict
2018-03-06 23:20:18 +01:00
import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib
2018-03-06 23:20:18 +01:00
from . import abstract
# PDF version passed to `restrict_to_version()` on every cairo PDFSurface
# created in this module.
# NOTE(review): presumably pinned so the output does not depend on the
# default of the installed cairo -- confirm.
FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
2018-03-06 23:20:18 +01:00
2023-01-28 16:57:20 +01:00
2018-03-06 23:20:18 +01:00
class PDFParser(abstract.AbstractParser):
    """ Read and remove the metadata of PDF files.

    Cleaning works by loading the document with Poppler and re-rendering
    every page onto a brand new cairo PDF surface, either directly
    (lightweight mode) or through an intermediate PNG (thorough mode).
    """
    mimetypes = {'application/pdf', }
    # Poppler document properties reported by `get_meta`.
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """ Open `filename` and check that Poppler is able to load it.

        :param filename: path of the PDF file to handle.
        :raises ValueError: if Poppler can not load the file.
        """
        super().__init__(filename)
        self.uri = self.__path_to_uri(self.filename)
        self.__scale = 200 / 72.0  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as e:  # Invalid PDF
            raise ValueError("Invalid PDF file: %s" % self.filename) from e

    @staticmethod
    def __path_to_uri(path: str) -> str:
        """ Return a properly escaped `file://` URI for `path`.

        Plain string concatenation yields an invalid URI as soon as the
        path contains characters such as `#` or `%`.
        """
        return GLib.filename_to_uri(os.path.abspath(path), None)

    @staticmethod
    def __make_tmp_file() -> str:
        """ Create an empty temporary file and return its path.

        The descriptor returned by `mkstemp` is closed right away: the
        file is only ever accessed by path afterwards, so keeping it
        open would leak one descriptor per call.
        """
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)
        return tmp_path

    def remove_all(self) -> bool:
        """ Write a cleaned copy of the document to `self.output_filename`.

        The lightweight renderer is used only when
        `self.lightweight_cleaning` is exactly `True`, the thorough one
        otherwise.

        :return: `True` on success, `False` if a page could not be loaded.
        """
        if self.lightweight_cleaning is True:
            return self.__remove_all_lightweight()
        return self.__remove_all_thorough()

    def __remove_all_lightweight(self) -> bool:
        """
        Load the document into Poppler, render pages on a new PDFSurface.

        :return: `True` on success, `False` if a page could not be loaded.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__make_tmp_file()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
            try:
                pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
                pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

                for pagenum in range(pages_count):
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    pdf_surface.set_size(page_width, page_height)
                    pdf_context.save()
                    page.render_for_printing(pdf_context)
                    pdf_context.restore()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always finish the surface, even on failure.
                pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind.
            os.remove(tmp_path)

        return True

    def __remove_all_thorough(self) -> bool:
        """
        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF.

        :return: `True` on success, `False` if a page could not be loaded.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__make_tmp_file()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            try:
                pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
                pdf_context = cairo.Context(pdf_surface)

                for pagenum in range(pages_count):
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                    # Rasterise the page at `self.__scale` times its size.
                    width = int(page_width * self.__scale)
                    height = int(page_height * self.__scale)
                    img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
                    img_context = cairo.Context(img_surface)
                    img_context.scale(self.__scale, self.__scale)
                    page.render_for_printing(img_context)
                    img_context.show_page()

                    # Round-trip the raster through an in-memory PNG.
                    buf = io.BytesIO()
                    img_surface.write_to_png(buf)
                    img_surface.finish()
                    buf.seek(0)
                    img = cairo.ImageSurface.create_from_png(buf)

                    if cairo.version_info < (1, 12, 0):
                        pdf_surface.set_size(width, height)
                    else:
                        # Keep the original page size and scale the
                        # bitmap down to fit it.
                        pdf_surface.set_size(page_width, page_height)
                        pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
                    pdf_context.set_source_surface(img, 0, 0)
                    pdf_context.paint()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always finish the surface, even on failure.
                pdf_surface.finish()

            # Removes the metadata added while generating the new PDF
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """ Copy `in_file` to `out_file`, blanking the generator metadata.

        :param in_file: path of the freshly rendered PDF.
        :param out_file: path the cleaned PDF is written to.
        :return: always `True`.
        """
        document = Poppler.Document.new_from_file(PDFParser.__path_to_uri(in_file), None)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save(PDFParser.__path_to_uri(out_file))

        # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
        # fails to remove them, we have to use this terrible regex.
        # It should(tm) be alright though, because cairo's output format
        # for metadata is fixed.
        with open(out_file, 'rb') as f:
            out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
                         count=0, flags=re.DOTALL | re.IGNORECASE)
        with open(out_file, 'wb') as f:
            f.write(out)

        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> Dict[str, str]:
        """ Extract `<ns:key>value</ns:key>` pairs from an XMP packet.

        Only elements in the `xmp`, `pdfx`, `pdf` and `xmpMM` namespaces
        (matched case-insensitively) are considered.

        :param data: raw content of the document's `metadata` property.
        :return: mapping of element names (without namespace) to values.
        """
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self) -> Dict[str, Union[str, Dict]]:
        """ Return a dict with all the meta of the file

        Every non-empty property listed in `meta_list` is reported; the
        fields parsed out of the `metadata` property, if any, are merged
        into the same dict.
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)  # looked up only once
            if value:
                metadata[key] = value

        if 'metadata' in metadata:
            metadata.update(self.__parse_metadata_field(metadata['metadata']))
        return metadata