1
0
mirror of synced 2024-11-22 01:04:23 +01:00

Implement mimetype detection

This commit is contained in:
jvoisin 2018-03-19 23:43:49 +01:00
parent d262f780f7
commit 8f44616366
4 changed files with 23 additions and 10 deletions

View File

@ -3,6 +3,7 @@ from shutil import copyfile
import argparse
from src.parsers import pdf
from src import parser_factory
def create_arg_parser():
@ -19,7 +20,7 @@ def create_arg_parser():
return parser
def show_meta(file_name:str):
p = pdf.PDFParser(file_name)
p = parser_factory(file_name)
for k,v in p.get_meta().items():
print("%s: %s" % (k, v))
@ -32,10 +33,10 @@ def main():
show_meta(f)
return 0
elif not args.files:
return parser.show_help()
return argparser.show_help()
copyfile(sys.argv[1] + '.bak', sys.argv[1])
p = pdf.PDFParser(sys.argv[1])
#p = pdf.PDFParser(sys.argv[1])
p = parser_factory.get_parser(sys.argv[1])
p.remove_all()
p = pdf.PDFParser('OUT_clean.pdf')
print("ok")

10
src/parser_factory.py Normal file
View File

@ -0,0 +1,10 @@
import mimetypes
from .parsers import abstract
from .parsers import *
def get_parser(filename: str):
    """Return a parser instance suited to *filename*, or None.

    The mimetype is guessed from the file name alone (via
    ``mimetypes.guess_type``); the file content is not inspected. The first
    direct subclass of ``abstract.AbstractParser`` whose ``mimetypes``
    collection contains that type is instantiated with *filename*.

    Returns None when the type cannot be guessed or when no parser
    declares support for it.
    """
    mtype, _ = mimetypes.guess_type(filename)
    if mtype is None:
        # Unknown or missing extension: nothing can match, skip the lookup.
        return None

    # Only parser classes that have already been imported show up in
    # __subclasses__().
    for parser_cls in abstract.AbstractParser.__subclasses__():
        # A subclass may lack a class-level `mimetypes`; treat that as
        # "handles nothing" instead of failing with AttributeError.
        if mtype in getattr(parser_cls, 'mimetypes', ()):
            return parser_cls(filename)
    return None

View File

@ -3,6 +3,7 @@ class AbstractParser(object):
self.filename = filename
self.output_filename = filename + '.cleaned'
self.meta_list = set()
self.mimetypes = set()
def get_meta(self):
raise NotImplementedError

View File

@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
mimetypes = {'application/pdf', }
meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
def __init__(self, filename):
super().__init__(filename)
self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None
def remove_all(self):
"""
@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering.
"""
document = Poppler.Document.new_from_file(self.uri, self.password)
document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp()
@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
""" Return a dict with all the meta of the file
"""
print("URI: %s", self.uri)
document = Poppler.Document.new_from_file(self.uri, self.password)
document = Poppler.Document.new_from_file(self.uri, None)
metadata = {}
for key in self.meta_list:
if document.get_property(key):