Improve the testsuite
This commit is contained in:
parent
069765376d
commit
df3c27d79d
1
libmat2/__init__.py
Normal file
1
libmat2/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
__version__ = '2.0'
|
0
libmat2/parsers/__init__.py
Normal file
0
libmat2/parsers/__init__.py
Normal file
10
libmat2/parsers/abstract.py
Normal file
10
libmat2/parsers/abstract.py
Normal file
@ -0,0 +1,10 @@
|
||||
class AbstractParser(object):
    """Base class of the file parsers.

    It only stores the file name and an empty set of metadata field names;
    `get_meta` and `remove_all` raise NotImplementedError and are meant to
    be overridden by concrete parsers.
    """

    def __init__(self, filename: str):
        """Remember which file this parser works on.

        :param filename: path of the file to handle.
        """
        self.filename = filename
        # Names of the metadata fields the parser looks at; empty in the
        # base class.
        self.meta_list = set()

    def get_meta(self):
        """Return the metadata of the file; must be overridden.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def remove_all(self):
        """Remove the metadata of the file; must be overridden.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
|
105
libmat2/parsers/pdf.py
Normal file
105
libmat2/parsers/pdf.py
Normal file
@ -0,0 +1,105 @@
|
||||
""" Handle PDF
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import tempfile
|
||||
import shutil
|
||||
import io
|
||||
|
||||
import cairo
|
||||
import gi
|
||||
gi.require_version('Poppler', '0.18')
|
||||
from gi.repository import Poppler, Gio, GLib
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None
|
||||
|
||||
from . import abstract
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
|
||||
class PDFParser(abstract.AbstractParser):
    """Read and strip the metadata of a PDF file with Poppler and cairo."""

    def __init__(self, filename):
        """Prepare a parser for `filename`.

        :param filename: path of the PDF to handle; it is made absolute and
            converted to the file URI handed to Poppler.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        # filename_to_uri percent-escapes characters ('#', '%', spaces, ...)
        # that a plain 'file://' + path concatenation would leave raw.
        self.uri = GLib.filename_to_uri(os.path.abspath(self.filename), None)
        self.password = None

    def _read_meta(self, document):
        """Return the non-empty `meta_list` properties of `document`.

        :param document: a Poppler document.
        :return: dict mapping each property name to its value as a string.
        """
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = str(value)
        return metadata

    def remove_all(self):
        """
        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF. Metadata from the new
        PDF are removed via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        TODO: Improve the resolution
        TODO: Don't use a temp file

        :return: True once the new document has been saved.
        """
        document = Poppler.Document.new_from_file(self.uri, self.password)
        page_count = document.get_n_pages()

        pdf_out = io.BytesIO()
        # The initial 128x128 size is a placeholder: set_size() is called
        # for every page below.
        pdf_surface = cairo.PDFSurface(pdf_out, 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(page_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, page_count)

            # Rasterize the page at twice its size.
            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
            img_context = cairo.Context(img_surface)

            img_context.scale(2, 2)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through an in-memory PNG so that only pixels end up
            # in the new PDF.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width*2, page_height*2)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Reopen the freshly rendered PDF from memory to overwrite the
        # producer/creator fields written by cairo.
        b = GLib.Bytes(pdf_out.getvalue())
        input_stream = Gio.MemoryInputStream.new_from_bytes(b)
        out_document = Poppler.Document.new_from_stream(input_stream, -1, self.password, None)
        out_document.set_producer('totally not MAT2 ;)')
        out_document.set_creator('')

        # NOTE(review): the output path is still the hard-coded "olol.pdf"
        # in the current working directory; it is kept as-is because moving
        # it changes where callers find the result.
        out_document.save(GLib.filename_to_uri(os.path.abspath("olol.pdf"), None))
        logging.debug("Metadata left after cleaning: %s", self._read_meta(out_document))

        return True

    def get_meta(self):
        """ Return a dict with all the meta of the file
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        return self._read_meta(document)
|
@ -1,6 +1,7 @@
|
||||
class AbstractParser(object):
|
||||
def __init__(self, filename: str):
|
||||
self.filename = filename
|
||||
self.output_filename = filename + '.cleaned'
|
||||
self.meta_list = set()
|
||||
|
||||
def get_meta(self):
|
||||
|
@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser):
|
||||
self.uri = 'file://' + os.path.abspath(self.filename)
|
||||
self.password = None
|
||||
|
||||
def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
    """Re-encode `img` as an optimized PNG.

    The image is resized to its own dimensions, so only the PNG
    re-encoding (`optimize=True`) can change the data.

    :param img: buffer holding an image that Pillow can open.
    :return: a new buffer, rewound to its start, holding the PNG; or `img`
        itself, untouched, when `Image` is None (Pillow not importable).
    """
    if Image is None:
        return img

    ret = io.BytesIO()
    im = Image.open(img)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    resized = im.resize(im.size, Image.LANCZOS)
    resized.save(ret, optimize=True, format="PNG")
    ret.seek(0)

    return ret
|
||||
|
||||
|
||||
def remove_all(self):
|
||||
"""
|
||||
Load the document into Poppler, render pages on PNG,
|
||||
@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser):
|
||||
"""
|
||||
document = Poppler.Document.new_from_file(self.uri, self.password)
|
||||
|
||||
pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
|
||||
pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
|
||||
pdf_context = cairo.Context(pdf_surface)
|
||||
|
||||
for pagenum in range(document.get_n_pages()):
|
||||
@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser):
|
||||
|
||||
pdf_surface.finish()
|
||||
|
||||
document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
|
||||
document.set_producer('totally not MAT2 ;)')
|
||||
document.set_creator('')
|
||||
document.save('file://' + os.path.abspath("OUT_clean.pdf"))
|
||||
# This is removing metadata
|
||||
#document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
|
||||
#document.set_producer('totally not MAT2 ;)')
|
||||
#document.set_creator('')
|
||||
#document.save('file://' + os.path.abspath("OUT_clean.pdf"))
|
||||
|
||||
return True
|
||||
|
||||
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
@ -10,18 +10,27 @@ from src.parsers import pdf
|
||||
class TestGetMeta(unittest.TestCase):
    """Check the metadata read from the dirty sample PDF."""

    def test_pdf(self):
        """The producer and creator of dirty.pdf are reported as-is."""
        p = pdf.PDFParser('./tests/data/dirty.pdf')
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
        self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
|
||||
|
||||
class TestCleaning(unittest.TestCase):
    """Check that cleaning a PDF leaves only the cairo-written metadata."""

    def setUp(self):
        # Work on a copy so the dirty fixture itself is never modified.
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')

    def tearDown(self):
        os.remove('./tests/data/clean.pdf')

    def test_pdf(self):
        """Metadata is present before cleaning and reduced afterwards."""
        p = pdf.PDFParser('./tests/data/clean.pdf')

        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        ret = p.remove_all()
        self.assertTrue(ret)

        # The cleaned copy is read back from '<input>.cleaned'.
        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
        remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)',
                          'producer': 'cairo 1.14.10 (http://cairographics.org)'}
        self.assertEqual(p.get_meta(), remaining_meta)
|
||||
|
Loading…
Reference in New Issue
Block a user