Add lightweight processing for PDF

2025-06-20 13:47:55 +02:00 · 2018-04-14 21:23:31 +02:00 · 2018-04-14 21:23:31 +02:00 · 96299c6a53
commit 96299c6a53
parent 6f4ed2490f
5 changed files with 84 additions and 15 deletions
--- a/main.py
+++ b/main.py
@ -31,6 +31,8 @@ def create_arg_parser():
                      help='list all supported fileformats')
    info.add_argument('-s', '--show', action='store_true',
                      help='list all the harmful metadata of a file without removing them')
+    info.add_argument('-L', '--lightweight', action='store_true',
+                      help='remove SOME metadata')
    return parser


@ -50,7 +52,7 @@ def show_meta(filename:str):
            print("  %s: harmful content" % k)


-def clean_meta(filename:str):
+def clean_meta(filename:str, is_lightweigth:bool):
    if not __check_file(filename, os.R_OK|os.W_OK):
        return

@ -58,7 +60,10 @@ def clean_meta(filename:str):
    if p is None:
        print("[-] %s's format (%s) is not supported" % (filename, mtype))
        return
-    p.remove_all()
+    if is_lightweigth:
+        p.remove_all_lightweight()
+    else:
+        p.remove_all()


 def show_parsers():
@ -78,12 +83,12 @@ def __get_files_recursively(files):
                for _f in _files:
                    yield os.path.join(path, _f)

-def __do_clean_async(q):
+def __do_clean_async(is_lightweigth, q):
    while True:
        f = q.get()
        if f is None:  # nothing more to process
            return
-        clean_meta(f)
+        clean_meta(f, is_lightweigth)  # clean_meta() takes (filename, is_lightweigth), in that order
        q.task_done()


@ -109,7 +114,7 @@ def main():
            q.put(f)

        for _ in range(multiprocessing.cpu_count()):
-            worker = Thread(target=__do_clean_async, args=(q, ))
+            worker = Thread(target=__do_clean_async, args=(mode, q))
            worker.start()
            threads.append(worker)

--- a/src/abstract.py
+++ b/src/abstract.py
@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
    @abc.abstractmethod
    def remove_all(self) -> bool:
        pass
+
+    def remove_all_lightweight(self) -> bool:
+        """ Remove _SOME_ metadata; falls back to remove_all() unless overridden. """
+        return self.remove_all()
--- a/src/pdf.py
+++ b/src/pdf.py
@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 2  # how much precision do we want for the render

+    def remove_all_lightweight(self):
+        """
+            Load the document into Poppler, render each page directly onto a new PDFSurface.
+        """
+        document = Poppler.Document.new_from_file(self.uri, None)
+        pages_count = document.get_n_pages()
+
+        tmp_fd, tmp_path = tempfile.mkstemp()
+        os.close(tmp_fd)  # mkstemp() returns an open descriptor; only the path is used below
+        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
+        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface
+
+        for pagenum in range(pages_count):
+            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
+            page = document.get_page(pagenum)
+            page_width, page_height = page.get_size()
+            pdf_surface.set_size(page_width, page_height)
+            pdf_context.save()
+            page.render_for_printing(pdf_context)
+            pdf_context.restore()
+            pdf_context.show_page()  # draw pdf_context on pdf_surface
+        pdf_surface.finish()
+        self.__remove_superficial_meta(tmp_path, self.output_filename)
+        os.remove(tmp_path)
+
+        return True
+
    def remove_all(self):
        """
            Load the document into Poppler, render pages on PNG,
-            and shove those PNG into a new PDF. Metadata from the new
-            PDF are removed via Poppler, because there is no way to tell
-            cairo to not add "created by cairo" during rendering.
+            and shove those PNG into a new PDF.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        _, tmp_path = tempfile.mkstemp()
-        pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
+        pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(pages_count):
@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
        pdf_surface.finish()

        # Removes metadata added by Poppler
-        document = Poppler.Document.new_from_file('file://' + tmp_path)
-        document.set_producer('')
-        document.set_creator('')
-        document.save('file://' + os.path.abspath(self.output_filename))
+        self.__remove_superficial_meta(tmp_path, self.output_filename)
        os.remove(tmp_path)

        return True

+    def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:  # reload in_file via Poppler, blank producer/creator, save to out_file
+        document = Poppler.Document.new_from_file('file://' + in_file)  # in_file is used as-is, so it must already be an absolute path
+        document.set_producer('')  # blank the producer field
+        document.set_creator('')  # blank the creator field
+        document.save('file://' + os.path.abspath(out_file))  # out_file is made absolute before building the URI
+        return True
+

    def __parse_metadata_field(self, data:str) -> dict:
        metadata = {}
--- a/tests/test_climat2.py
+++ b/tests/test_climat2.py
@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
    def test_help(self):
        proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
-        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
+        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)

    def test_no_arg(self):
        proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
-        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
+        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)


 class TestGetMeta(unittest.TestCase):
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):

        os.remove('./tests/data/clean.odt')

+class TestLightWeightCleaning(unittest.TestCase):
+    def test_pdf(self):
+        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')  # work on a copy of the fixture
+        p = pdf.PDFParser('./tests/data/clean.pdf')
+
+        meta = p.get_meta()
+        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')  # the fixture starts out with producer metadata
+
+        ret = p.remove_all_lightweight()
+        self.assertTrue(ret)
+
+        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')  # re-parse the '.cleaned' output
+        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}  # nothing else may remain
+        self.assertEqual(p.get_meta(), expected_meta)
+
+        os.remove('./tests/data/clean.pdf')  # NOTE(review): 'clean.pdf.cleaned' is not removed here - confirm cleanup happens elsewhere
+
+    def test_png(self):
+        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')  # work on a copy of the fixture
+        p = images.PNGParser('./tests/data/clean.png')
+
+        meta = p.get_meta()
+        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')  # the fixture starts out with a comment field
+
+        ret = p.remove_all_lightweight()  # NOTE(review): presumably the AbstractParser fallback - PNGParser is not shown here, confirm
+        self.assertTrue(ret)
+
+        p = images.PNGParser('./tests/data/clean.png.cleaned')  # re-parse the '.cleaned' output
+        self.assertEqual(p.get_meta(), {})
+
+        os.remove('./tests/data/clean.png')  # NOTE(review): 'clean.png.cleaned' is not removed here - confirm cleanup happens elsewhere

 class TestCleaning(unittest.TestCase):
    def test_pdf(self):