106 lines
3.3 KiB
Python
106 lines
3.3 KiB
Python
|
""" Handle PDF
|
||
|
|
||
|
"""
|
||
|
|
||
|
import os
|
||
|
import logging
|
||
|
import tempfile
|
||
|
import shutil
|
||
|
import io
|
||
|
|
||
|
import cairo
|
||
|
import gi
|
||
|
gi.require_version('Poppler', '0.18')
|
||
|
from gi.repository import Poppler, Gio, GLib
|
||
|
|
||
|
try:
|
||
|
from PIL import Image
|
||
|
except ImportError:
|
||
|
Image = None
|
||
|
|
||
|
from . import abstract
|
||
|
|
||
|
# NOTE(review): this configures the root logger at DEBUG level as a side
# effect of importing this module (basicConfig is a no-op when the root
# logger already has handlers) -- confirm a library module should do this.
logging.basicConfig(level=logging.DEBUG)
|
||
|
|
||
|
|
||
|
class PDFParser(abstract.AbstractParser):
    """ Read and strip the metadata of a PDF file using Poppler and cairo.

    Attributes set by ``__init__``:
        meta_list: names of the Poppler document properties treated as metadata.
        uri: ``file://`` URI of the absolute path of the file.
        password: password passed to Poppler when opening documents
            (``None`` unless changed by the caller).
    """

    # Pages are rasterised at this multiple of their nominal size.
    _RENDER_SCALE = 2

    def __init__(self, filename):
        """ Remember where the file lives and which properties count as metadata.

        :param filename: path of the PDF file to handle.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        # Let GLib build the URI so that characters such as '#', '%' or
        # spaces in the path are percent-escaped instead of corrupting it.
        self.uri = GLib.filename_to_uri(os.path.abspath(self.filename), None)
        self.password = None

    def _extract_meta(self, document):
        """ Return a dict of the non-empty `meta_list` properties of `document`,
        each value converted to `str`.
        """
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = str(value)
        return metadata

    def remove_all(self):
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF. Metadata from the new
            PDF are removed via Poppler, because there is no way to tell
            cairo to not add "created by cairo" during rendering.

            Returns True once the new document has been saved.

            TODO: Improve the resolution
            TODO: Don't use a temp file
        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        pages_count = document.get_n_pages()
        scale = self._RENDER_SCALE

        pdf_out = io.BytesIO()
        # The initial size is a placeholder: it is reset for every page below.
        pdf_surface = cairo.PDFSurface(pdf_out, 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(pages_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

            # Rasterise the page onto an image surface.
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width) * scale,
                                             int(page_height) * scale)
            img_context = cairo.Context(img_surface)
            img_context.scale(scale, scale)
            page.render_for_printing_with_options(img_context,
                                                  Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through an in-memory PNG, so that only pixels
            # are carried over into the new document.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width * scale, page_height * scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Re-open the freshly generated PDF with Poppler to overwrite the
        # producer/creator fields written by cairo.
        pdf_bytes = GLib.Bytes(pdf_out.getvalue())
        input_stream = Gio.MemoryInputStream.new_from_bytes(pdf_bytes)
        out_document = Poppler.Document.new_from_stream(input_stream, -1,
                                                        self.password, None)
        out_document.set_producer('totally not MAT2 ;)')
        out_document.set_creator('')
        logging.debug("Metadata after cleaning: %s",
                      self._extract_meta(out_document))

        # NOTE(review): the output name is hard-coded and relative to the
        # current working directory -- presumably a placeholder; confirm
        # where the cleaned file is supposed to be written.
        out_document.save(GLib.filename_to_uri(os.path.abspath("olol.pdf"), None))

        return True

    def get_meta(self):
        """ Return a dict with all the meta of the file
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        return self._extract_meta(document)
|