1
0
mirror synced 2024-11-22 09:14:23 +01:00

Clean metadata

This commit is contained in:
jvoisin 2018-03-18 23:48:14 +01:00
parent df3c27d79d
commit acb9b2d14e
2 changed files with 18 additions and 24 deletions

View File

@ -7,17 +7,13 @@ import logging
import tempfile import tempfile
import shutil import shutil
import io import io
import tempfile
import cairo import cairo
import gi import gi
gi.require_version('Poppler', '0.18') gi.require_version('Poppler', '0.18')
from gi.repository import Poppler from gi.repository import Poppler
try:
from PIL import Image
except ImportError:
Image = None
from . import abstract from . import abstract
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser): class PDFParser(abstract.AbstractParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.meta_list = {'title', 'author', 'subject', self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'keywords', 'creator', 'producer', 'metadata'} 'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None self.password = None
@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser):
and shove those PNG into a new PDF. Metadata from the new and shove those PNG into a new PDF. Metadata from the new
PDF are removed via Poppler, because there is no way to tell PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering. cairo to not add "created by cairo" during rendering.
TODO: Improve the resolution
TODO: Don't use a temp file
""" """
document = Poppler.Document.new_from_file(self.uri, self.password) document = Poppler.Document.new_from_file(self.uri, self.password)
pages_count = document.get_n_pages()
pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) _, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
pdf_context = cairo.Context(pdf_surface) pdf_context = cairo.Context(pdf_surface)
for pagenum in range(document.get_n_pages()): for pagenum in range(pages_count):
page = document.get_page(pagenum) page = document.get_page(pagenum)
page_width, page_height = page.get_size() page_width, page_height = page.get_size()
logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
img_context = cairo.Context(img_surface) img_context = cairo.Context(img_surface)
img_context.scale(2, 2) img_context.scale(2, 2)
page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) page.render_for_printing(img_context)
img_context.show_page() img_context.show_page()
buf = io.BytesIO() buf = io.BytesIO()
@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser):
img_surface.finish() img_surface.finish()
buf.seek(0) buf.seek(0)
#buf = self.__optimize_image_size(buf)
img = cairo.ImageSurface.create_from_png(buf) img = cairo.ImageSurface.create_from_png(buf)
pdf_surface.set_size(page_width*2, page_height*2) pdf_surface.set_size(page_width*2, page_height*2)
pdf_context.set_source_surface(img, 0, 0) pdf_context.set_source_surface(img, 0, 0)
@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish() pdf_surface.finish()
# This is removing metadata # This is removing metadata added by Poppler
#document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) document = Poppler.Document.new_from_file('file://' + tmp_path)
#document.set_producer('totally not MAT2 ;)') document.set_producer('')
#document.set_creator('') document.set_creator('')
#document.save('file://' + os.path.abspath("OUT_clean.pdf")) document.save('file://' + os.path.abspath(self.output_filename))
os.remove(tmp_path)
return True return True

View File

@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase):
self.assertTrue(ret) self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
'producer': 'cairo 1.14.10 (http://cairographics.org)'} self.assertEqual(p.get_meta(), expected_meta)
self.assertEqual(p.get_meta(), remaining_meta)