Add lightweight processing for PDF
This commit is contained in:
parent
6f4ed2490f
commit
96299c6a53
15
main.py
15
main.py
@ -31,6 +31,8 @@ def create_arg_parser():
|
||||
help='list all supported fileformats')
|
||||
info.add_argument('-s', '--show', action='store_true',
|
||||
help='list all the harmful metadata of a file without removing them')
|
||||
info.add_argument('-L', '--lightweight', action='store_true',
|
||||
help='remove SOME metadata')
|
||||
return parser
|
||||
|
||||
|
||||
@ -50,7 +52,7 @@ def show_meta(filename:str):
|
||||
print(" %s: harmful content" % k)
|
||||
|
||||
|
||||
def clean_meta(filename:str):
|
||||
def clean_meta(filename:str, is_lightweigth:bool):
|
||||
if not __check_file(filename, os.R_OK|os.W_OK):
|
||||
return
|
||||
|
||||
@ -58,7 +60,10 @@ def clean_meta(filename:str):
|
||||
if p is None:
|
||||
print("[-] %s's format (%s) is not supported" % (filename, mtype))
|
||||
return
|
||||
p.remove_all()
|
||||
if is_lightweigth:
|
||||
p.remove_all_lightweight()
|
||||
else:
|
||||
p.remove_all()
|
||||
|
||||
|
||||
def show_parsers():
|
||||
@ -78,12 +83,12 @@ def __get_files_recursively(files):
|
||||
for _f in _files:
|
||||
yield os.path.join(path, _f)
|
||||
|
||||
def __do_clean_async(q):
|
||||
def __do_clean_async(is_lightweigth, q):
|
||||
while True:
|
||||
f = q.get()
|
||||
if f is None: # nothing more to process
|
||||
return
|
||||
clean_meta(f)
|
||||
clean_meta(is_lightweigth, f)
|
||||
q.task_done()
|
||||
|
||||
|
||||
@ -109,7 +114,7 @@ def main():
|
||||
q.put(f)
|
||||
|
||||
for _ in range(multiprocessing.cpu_count()):
|
||||
worker = Thread(target=__do_clean_async, args=(q, ))
|
||||
worker = Thread(target=__do_clean_async, args=(mode, q))
|
||||
worker.start()
|
||||
threads.append(worker)
|
||||
|
||||
|
@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
|
||||
@abc.abstractmethod
|
||||
def remove_all(self) -> bool:
|
||||
pass
|
||||
|
||||
def remove_all_lightweight(self) -> bool:
|
||||
""" Remove _SOME_ metadata. """
|
||||
return self.remove_all()
|
||||
|
45
src/pdf.py
45
src/pdf.py
@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
|
||||
self.uri = 'file://' + os.path.abspath(self.filename)
|
||||
self.__scale = 2 # how much precision do we want for the render
|
||||
|
||||
def remove_all_lightweight(self):
|
||||
"""
|
||||
Load the document into Poppler, render pages on a new PDFSurface.
|
||||
"""
|
||||
document = Poppler.Document.new_from_file(self.uri, None)
|
||||
pages_count = document.get_n_pages()
|
||||
|
||||
tmp_path = tempfile.mkstemp()[1]
|
||||
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
|
||||
pdf_context = cairo.Context(pdf_surface) # context draws on the surface
|
||||
|
||||
for pagenum in range(pages_count):
|
||||
logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
|
||||
page = document.get_page(pagenum)
|
||||
page_width, page_height = page.get_size()
|
||||
pdf_surface.set_size(page_width, page_height)
|
||||
pdf_context.save()
|
||||
page.render_for_printing(pdf_context)
|
||||
pdf_context.restore()
|
||||
pdf_context.show_page() # draw pdf_context on pdf_surface
|
||||
pdf_surface.finish()
|
||||
|
||||
self.__remove_superficial_meta(tmp_path, self.output_filename)
|
||||
os.remove(tmp_path)
|
||||
|
||||
return True
|
||||
|
||||
def remove_all(self):
|
||||
"""
|
||||
Load the document into Poppler, render pages on PNG,
|
||||
and shove those PNG into a new PDF. Metadata from the new
|
||||
PDF are removed via Poppler, because there is no way to tell
|
||||
cairo to not add "created by cairo" during rendering.
|
||||
and shove those PNG into a new PDF.
|
||||
"""
|
||||
document = Poppler.Document.new_from_file(self.uri, None)
|
||||
pages_count = document.get_n_pages()
|
||||
|
||||
_, tmp_path = tempfile.mkstemp()
|
||||
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
|
||||
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
|
||||
pdf_context = cairo.Context(pdf_surface)
|
||||
|
||||
for pagenum in range(pages_count):
|
||||
@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
|
||||
pdf_surface.finish()
|
||||
|
||||
# Removes metadata added by cairo (stripped via Poppler)
|
||||
document = Poppler.Document.new_from_file('file://' + tmp_path)
|
||||
document.set_producer('')
|
||||
document.set_creator('')
|
||||
document.save('file://' + os.path.abspath(self.output_filename))
|
||||
self.__remove_superficial_meta(tmp_path, self.output_filename)
|
||||
os.remove(tmp_path)
|
||||
|
||||
return True
|
||||
|
||||
def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
|
||||
document = Poppler.Document.new_from_file('file://' + in_file)
|
||||
document.set_producer('')
|
||||
document.set_creator('')
|
||||
document.save('file://' + os.path.abspath(out_file))
|
||||
return True
|
||||
|
||||
|
||||
def __parse_metadata_field(self, data:str) -> dict:
|
||||
metadata = {}
|
||||
|
@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
|
||||
def test_help(self):
|
||||
proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
|
||||
stdout, _ = proc.communicate()
|
||||
self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
|
||||
self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
|
||||
|
||||
def test_no_arg(self):
|
||||
proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
|
||||
stdout, _ = proc.communicate()
|
||||
self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
|
||||
self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
|
||||
|
||||
|
||||
class TestGetMeta(unittest.TestCase):
|
||||
|
@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
|
||||
|
||||
os.remove('./tests/data/clean.odt')
|
||||
|
||||
class TestLightWeightCleaning(unittest.TestCase):
|
||||
def test_pdf(self):
|
||||
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
|
||||
p = pdf.PDFParser('./tests/data/clean.pdf')
|
||||
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
|
||||
|
||||
ret = p.remove_all_lightweight()
|
||||
self.assertTrue(ret)
|
||||
|
||||
p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
|
||||
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
|
||||
self.assertEqual(p.get_meta(), expected_meta)
|
||||
|
||||
os.remove('./tests/data/clean.pdf')
|
||||
|
||||
def test_png(self):
|
||||
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
|
||||
p = images.PNGParser('./tests/data/clean.png')
|
||||
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
|
||||
|
||||
ret = p.remove_all_lightweight()
|
||||
self.assertTrue(ret)
|
||||
|
||||
p = images.PNGParser('./tests/data/clean.png.cleaned')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
|
||||
os.remove('./tests/data/clean.png')
|
||||
|
||||
class TestCleaning(unittest.TestCase):
|
||||
def test_pdf(self):
|
||||
|
Loading…
Reference in New Issue
Block a user