2018-03-06 23:20:18 +01:00
|
|
|
""" Handle PDF
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
import os
|
2018-04-11 23:20:59 +02:00
|
|
|
import re
|
2018-03-06 23:20:18 +01:00
|
|
|
import logging
|
|
|
|
import tempfile
|
|
|
|
import io
|
2023-01-28 16:57:20 +01:00
|
|
|
from typing import Union, Dict
|
2018-03-06 23:20:18 +01:00
|
|
|
|
|
|
|
import cairo
|
|
|
|
import gi
|
|
|
|
# Select the Poppler introspection API version before it is imported from
# gi.repository below.
gi.require_version('Poppler', '0.18')
|
2018-05-06 21:58:31 +02:00
|
|
|
from gi.repository import Poppler, GLib
|
2018-03-06 23:20:18 +01:00
|
|
|
|
|
|
|
from . import abstract
|
|
|
|
|
2022-03-28 22:34:57 +02:00
|
|
|
# PDF version passed to PDFSurface.restrict_to_version() for every surface
# created in this module.
FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
|
2018-03-06 23:20:18 +01:00
|
|
|
|
2023-01-28 16:57:20 +01:00
|
|
|
|
2018-03-06 23:20:18 +01:00
|
|
|
class PDFParser(abstract.AbstractParser):
    """Read and remove metadata from PDF files using Poppler and cairo.

    Cleaning works by re-rendering every page of the document into a brand
    new PDF, then blanking the few fields that the rendering itself adds.
    """

    mimetypes = {'application/pdf', }
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """Check that `filename` can be opened as a PDF.

        :raises ValueError: if Poppler refuses to load the file.
        """
        super().__init__(filename)
        # Let GLib build the URI: plain 'file://' + path concatenation leaves
        # characters such as '#', '%' or spaces unescaped, and Poppler then
        # fails to map the URI back to the file.
        self.uri = GLib.filename_to_uri(os.path.abspath(self.filename), None)
        self.__scale = 200 / 72.0  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as e:  # Invalid PDF
            raise ValueError from e

    def remove_all(self) -> bool:
        """Write a cleaned copy of the document to `self.output_filename`.

        The lightweight renderer is used when `self.lightweight_cleaning` is
        True, the thorough one otherwise.

        :return: True on success, False if a page could not be loaded.
        :raises RuntimeError: if cairo fails during lightweight cleaning.
        """
        if self.lightweight_cleaning is True:
            try:
                return self.__remove_all_lightweight()
            except cairo.Error as e:
                raise RuntimeError(e) from e
        return self.__remove_all_thorough()

    def __remove_all_lightweight(self) -> bool:
        """
            Load the document into Poppler, render pages on a new PDFSurface.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        fd, tmp_path = tempfile.mkstemp()
        # cairo opens the file by path; close our descriptor so it isn't leaked
        os.close(fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
            try:
                pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
                pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

                for pagenum in range(pages_count):
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    pdf_surface.set_size(page_width, page_height)
                    pdf_context.save()
                    page.render_for_printing(pdf_context)
                    pdf_context.restore()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always flush/close the surface before the file is deleted
                pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, even on failure
            os.remove(tmp_path)

        return True

    def __remove_all_thorough(self) -> bool:
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        fd, tmp_path = tempfile.mkstemp()
        # cairo opens the file by path; close our descriptor so it isn't leaked
        os.close(fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            try:
                pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
                pdf_context = cairo.Context(pdf_surface)

                for pagenum in range(pages_count):
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                    # Rasterise the page at the configured scale
                    width = int(page_width * self.__scale)
                    height = int(page_height * self.__scale)
                    img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
                    img_context = cairo.Context(img_surface)

                    img_context.scale(self.__scale, self.__scale)
                    page.render_for_printing(img_context)
                    img_context.show_page()

                    # Round-trip through an in-memory PNG
                    buf = io.BytesIO()
                    img_surface.write_to_png(buf)
                    img_surface.finish()
                    buf.seek(0)

                    img = cairo.ImageSurface.create_from_png(buf)
                    if cairo.version_info < (1, 12, 0):
                        pdf_surface.set_size(width, height)
                    else:
                        # Keep the original page size and scale the bitmap
                        # down onto it instead of enlarging the page
                        pdf_surface.set_size(page_width, page_height)
                        pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
                    pdf_context.set_source_surface(img, 0, 0)
                    pdf_context.paint()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always flush/close the surface before the file is deleted
                pdf_surface.finish()

            # Blank the producer/creator/creation date of the rendered file
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, even on failure
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """Copy `in_file` to `out_file` with producer, creator and creation
        date blanked, then scrub any remaining `/Producer` dictionary.

        :return: always True.
        """
        # Build properly escaped URIs (see __init__ for the rationale)
        in_uri = GLib.filename_to_uri(os.path.abspath(in_file), None)
        out_uri = GLib.filename_to_uri(os.path.abspath(out_file), None)

        document = Poppler.Document.new_from_file(in_uri)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save(out_uri)

        # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
        # fails to remove them, we have to use this terrible regex.
        # It should(tm) be alright though, because cairo's output format
        # for metadata is fixed.
        with open(out_file, 'rb') as f:
            # count/flags are passed by keyword: passing them positionally
            # is deprecated in recent Python versions
            out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
                         count=0, flags=re.DOTALL | re.IGNORECASE)
        with open(out_file, 'wb') as f:
            f.write(out)

        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> Dict[str, str]:
        """Extract `<ns:key>value</ns:key>` pairs from `data`, where `ns` is
        one of xmp, pdfx, pdf or xmpMM (case-insensitive).

        When a key appears several times, the last value wins.
        """
        matches = re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I)
        return {key: value for (_, key, value) in matches}

    def get_meta(self) -> Dict[str, Union[str, Dict]]:
        """ Return a dict with all the meta of the file
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)
            if value:  # skip unset/empty properties
                metadata[key] = value
        if 'metadata' in metadata:
            # Expose the individual fields found inside the 'metadata' blob
            metadata.update(self.__parse_metadata_field(metadata['metadata']))
        return metadata
|