1
0
Fork 0
mat2/src/parsers/pdf.py

89 lines
2.8 KiB
Python
Raw Normal View History

2018-03-06 23:20:18 +01:00
""" Handle PDF
"""
import os
import logging
import tempfile
import shutil
import io
2018-03-18 23:48:14 +01:00
import tempfile
2018-03-06 23:20:18 +01:00
import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler
from . import abstract
logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser):
    """Parser that reads and strips the metadata of a PDF file.

    Metadata is read through Poppler document properties. Stripping is done
    by re-rendering every page to a raster image with cairo and assembling
    those images into a brand new PDF.
    """

    def __init__(self, filename):
        """Set up the parser for *filename*.

        :param filename: path of the PDF to handle; forwarded unchanged to
            the parent constructor.
        """
        super().__init__(filename)
        # Names of the Poppler.Document properties that get_meta() inspects.
        self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                          'metadata', 'mod-date', 'producer', 'subject', 'title',
                          'viewer-preferences'}
        # NOTE(review): the path is concatenated as-is, without
        # percent-encoding -- confirm that file names containing characters
        # such as '%' or '#' are handled as intended by Poppler.
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.password = None

    def remove_all(self):
        """Write a metadata-free copy of the document to ``self.output_filename``.

        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF. Metadata from the new
        PDF are removed via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        The intermediate PDF is written to a temporary file which is deleted
        before returning, whether or not the operation succeeded.

        :return: ``True`` once the cleaned file has been saved. Errors from
            Poppler or cairo propagate to the caller.
        """
        # Pages are rasterised at twice their nominal size; the output page
        # is sized to match the enlarged raster.
        scale = 2

        document = Poppler.Document.new_from_file(self.uri, self.password)
        pages_count = document.get_n_pages()

        # mkstemp() hands back an *open* descriptor; cairo reopens the file
        # by name, so close ours right away instead of leaking it.
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)
        try:
            # The initial 128x128 size is a placeholder: set_size() is called
            # for every page before anything is painted on it.
            pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
            pdf_context = cairo.Context(pdf_surface)

            for pagenum in range(pages_count):
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                                 int(page_width) * scale,
                                                 int(page_height) * scale)
                img_context = cairo.Context(img_surface)
                img_context.scale(scale, scale)
                page.render_for_printing(img_context)
                img_context.show_page()

                # Round-trip the rendered page through an in-memory PNG.
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)

                img = cairo.ImageSurface.create_from_png(buf)
                pdf_surface.set_size(page_width * scale, page_height * scale)
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
                pdf_context.show_page()

            pdf_surface.finish()

            # Blank the producer/creator fields that the rendering step
            # stamped on the intermediate PDF, then save the final file.
            document = Poppler.Document.new_from_file('file://' + tmp_path)
            document.set_producer('')
            document.set_creator('')
            document.save('file://' + os.path.abspath(self.output_filename))
        finally:
            # Never leave the intermediate file behind, even on failure.
            os.remove(tmp_path)

        return True

    def get_meta(self):
        """Return a dict with all the meta of the file.

        Only the properties listed in ``self.meta_list`` are looked up, and
        only those with a truthy value are included in the result.

        :return: mapping of property name to the value reported by Poppler.
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        return metadata