2018-03-06 23:20:18 +01:00
|
|
|
""" Handle PDF
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
import os
|
|
|
|
import logging
|
|
|
|
import tempfile
|
|
|
|
import shutil
|
|
|
|
import io
|
2018-03-18 23:48:14 +01:00
|
|
|
import tempfile
|
2018-03-06 23:20:18 +01:00
|
|
|
|
|
|
|
import cairo
|
|
|
|
import gi
|
|
|
|
gi.require_version('Poppler', '0.18')
|
|
|
|
from gi.repository import Poppler
|
|
|
|
|
|
|
|
from . import abstract
|
|
|
|
|
|
|
|
# Configure the root logger when this module is imported so that messages of
# level DEBUG and above are emitted.
# NOTE(review): this is a module-level side effect; confirm that importing
# applications are expected to have their logging configured from here.
logging.basicConfig(level=logging.DEBUG)
|
|
|
|
|
|
|
|
|
|
|
|
class PDFParser(abstract.AbstractParser):
    """Parser for PDF files, able to list and to strip their metadata.

    Cleaning is done by rendering every page to a bitmap and building a
    new PDF out of those bitmaps (see ``remove_all``); reading is done
    through the properties exposed by ``Poppler.Document`` (see
    ``get_meta``).
    """

    mimetypes = {'application/pdf', }
    # Names of the Poppler.Document properties queried by get_meta().
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """Set up the parser for ``filename``.

        :param filename: path of the PDF file; it is forwarded unchanged
            to the parent constructor.
        """
        super().__init__(filename)
        # Poppler.Document.new_from_file() takes a URI, not a plain path.
        # NOTE(review): self.filename is presumably set by the parent
        # constructor; the path is not percent-encoded here.
        self.uri = 'file://' + os.path.abspath(self.filename)
        # Factor applied both to the intermediate bitmaps and to the size
        # of the pages of the generated PDF.
        self.__scale = 2

    def remove_all(self):
        """Write a cleaned copy of the PDF to ``self.output_filename``.

        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF. Metadata from the new
        PDF are removed via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        The pages of the resulting PDF are ``scale`` times larger than the
        pages of the source document, and only contain bitmaps.

        :returns: ``True``; errors raised by cairo, Poppler or the
            filesystem are propagated to the caller.
        """
        scale = self.__scale
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_fd, tmp_path = tempfile.mkstemp()
        # cairo and Poppler access the file by its path, so the descriptor
        # returned by mkstemp() is not needed: close it right away instead
        # of leaking it.
        os.close(tmp_fd)

        try:
            # The initial size is irrelevant: it is reset for every page.
            pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(pages_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                # Render the page into an in-memory bitmap.
                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                                 int(page_width) * scale,
                                                 int(page_height) * scale)
                img_context = cairo.Context(img_surface)
                img_context.scale(scale, scale)
                page.render_for_printing(img_context)
                img_context.show_page()

                # Round-trip the bitmap through a PNG held in memory.
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)
                img = cairo.ImageSurface.create_from_png(buf)

                # Size the output page with the same factor as the bitmap
                # (this used to be a hard-coded 2), then paint the bitmap
                # on it.
                pdf_surface.set_size(page_width * scale, page_height * scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Blank the producer/creator fields that cairo wrote into the
            # file it generated, then save the result at its final place.
            document = Poppler.Document.new_from_file('file://' + tmp_path,
                                                      None)
            document.set_producer('')
            document.set_creator('')
            document.save('file://' + os.path.abspath(self.output_filename))
        finally:
            # Never leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    def get_meta(self):
        """Return a dict with all the meta of the file.

        :returns: a dict mapping every name of ``meta_list`` whose value
            in the document is truthy to that value; names with an empty
            or unset value are left out.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        metadata = {}
        for key in self.meta_list:
            # Query each property once and reuse the value.
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata
|