Clean metadata
This commit is contained in:
parent
df3c27d79d
commit
acb9b2d14e
@ -7,17 +7,13 @@ import logging
|
|||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
import io
|
import io
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import cairo
|
import cairo
|
||||||
import gi
|
import gi
|
||||||
gi.require_version('Poppler', '0.18')
|
gi.require_version('Poppler', '0.18')
|
||||||
from gi.repository import Poppler
|
from gi.repository import Poppler
|
||||||
|
|
||||||
try:
|
|
||||||
from PIL import Image
|
|
||||||
except ImportError:
|
|
||||||
Image = None
|
|
||||||
|
|
||||||
from . import abstract
|
from . import abstract
|
||||||
|
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG)
|
|||||||
class PDFParser(abstract.AbstractParser):
|
class PDFParser(abstract.AbstractParser):
|
||||||
def __init__(self, filename):
|
def __init__(self, filename):
|
||||||
super().__init__(filename)
|
super().__init__(filename)
|
||||||
self.meta_list = {'title', 'author', 'subject',
|
self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
|
||||||
'keywords', 'creator', 'producer', 'metadata'}
|
'metadata', 'mod-date', 'producer', 'subject', 'title',
|
||||||
|
'viewer-preferences'}
|
||||||
self.uri = 'file://' + os.path.abspath(self.filename)
|
self.uri = 'file://' + os.path.abspath(self.filename)
|
||||||
self.password = None
|
self.password = None
|
||||||
|
|
||||||
@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
and shove those PNG into a new PDF. Metadata from the new
|
and shove those PNG into a new PDF. Metadata from the new
|
||||||
PDF are removed via Poppler, because there is no way to tell
|
PDF are removed via Poppler, because there is no way to tell
|
||||||
cairo to not add "created by cairo" during rendering.
|
cairo to not add "created by cairo" during rendering.
|
||||||
|
|
||||||
TODO: Improve the resolution
|
|
||||||
TODO: Don't use a temp file
|
|
||||||
"""
|
"""
|
||||||
document = Poppler.Document.new_from_file(self.uri, self.password)
|
document = Poppler.Document.new_from_file(self.uri, self.password)
|
||||||
|
pages_count = document.get_n_pages()
|
||||||
|
|
||||||
pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
|
_, tmp_path = tempfile.mkstemp()
|
||||||
|
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
|
||||||
pdf_context = cairo.Context(pdf_surface)
|
pdf_context = cairo.Context(pdf_surface)
|
||||||
|
|
||||||
for pagenum in range(document.get_n_pages()):
|
for pagenum in range(pages_count):
|
||||||
page = document.get_page(pagenum)
|
page = document.get_page(pagenum)
|
||||||
page_width, page_height = page.get_size()
|
page_width, page_height = page.get_size()
|
||||||
logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages())
|
logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
|
||||||
|
|
||||||
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
|
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
|
||||||
img_context = cairo.Context(img_surface)
|
img_context = cairo.Context(img_surface)
|
||||||
|
|
||||||
img_context.scale(2, 2)
|
img_context.scale(2, 2)
|
||||||
page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
|
page.render_for_printing(img_context)
|
||||||
img_context.show_page()
|
img_context.show_page()
|
||||||
|
|
||||||
buf = io.BytesIO()
|
buf = io.BytesIO()
|
||||||
@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
img_surface.finish()
|
img_surface.finish()
|
||||||
buf.seek(0)
|
buf.seek(0)
|
||||||
|
|
||||||
#buf = self.__optimize_image_size(buf)
|
|
||||||
|
|
||||||
img = cairo.ImageSurface.create_from_png(buf)
|
img = cairo.ImageSurface.create_from_png(buf)
|
||||||
pdf_surface.set_size(page_width*2, page_height*2)
|
pdf_surface.set_size(page_width*2, page_height*2)
|
||||||
pdf_context.set_source_surface(img, 0, 0)
|
pdf_context.set_source_surface(img, 0, 0)
|
||||||
@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
|
|
||||||
pdf_surface.finish()
|
pdf_surface.finish()
|
||||||
|
|
||||||
# This is removing metadata
|
# This is removing metadata added by cairo (the removal is done via Poppler)
|
||||||
#document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
|
document = Poppler.Document.new_from_file('file://' + tmp_path)
|
||||||
#document.set_producer('totally not MAT2 ;)')
|
document.set_producer('')
|
||||||
#document.set_creator('')
|
document.set_creator('')
|
||||||
#document.save('file://' + os.path.abspath("OUT_clean.pdf"))
|
document.save('file://' + os.path.abspath(self.output_filename))
|
||||||
|
os.remove(tmp_path)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase):
|
|||||||
self.assertTrue(ret)
|
self.assertTrue(ret)
|
||||||
|
|
||||||
p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
|
p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
|
||||||
remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)',
|
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
|
||||||
'producer': 'cairo 1.14.10 (http://cairographics.org)'}
|
self.assertEqual(p.get_meta(), expected_meta)
|
||||||
self.assertEqual(p.get_meta(), remaining_meta)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user