1
0
mirror of synced 2024-11-22 09:14:23 +01:00

Implement mimetype detection

This commit is contained in:
jvoisin 2018-03-19 23:43:49 +01:00
parent d262f780f7
commit 8f44616366
4 changed files with 23 additions and 10 deletions

View File

@ -3,6 +3,7 @@ from shutil import copyfile
import argparse import argparse
from src.parsers import pdf from src.parsers import pdf
from src import parser_factory
def create_arg_parser(): def create_arg_parser():
@ -19,7 +20,7 @@ def create_arg_parser():
return parser return parser
def show_meta(file_name:str): def show_meta(file_name:str):
p = pdf.PDFParser(file_name) p = parser_factory(file_name)
for k,v in p.get_meta().items(): for k,v in p.get_meta().items():
print("%s: %s" % (k, v)) print("%s: %s" % (k, v))
@ -32,10 +33,10 @@ def main():
show_meta(f) show_meta(f)
return 0 return 0
elif not args.files: elif not args.files:
return parser.show_help() return argparser.show_help()
copyfile(sys.argv[1] + '.bak', sys.argv[1]) #p = pdf.PDFParser(sys.argv[1])
p = pdf.PDFParser(sys.argv[1]) p = parser_factory.get_parser(sys.argv[1])
p.remove_all() p.remove_all()
p = pdf.PDFParser('OUT_clean.pdf') p = pdf.PDFParser('OUT_clean.pdf')
print("ok") print("ok")

10
src/parser_factory.py Normal file
View File

@ -0,0 +1,10 @@
import mimetypes
from .parsers import abstract
from .parsers import *
def get_parser(filename: str):
    """Return a parser instance suited to `filename`, or None.

    The mimetype is guessed with `mimetypes.guess_type`, then every direct
    subclass of `abstract.AbstractParser` is checked in turn: the first one
    whose `mimetypes` collection contains the guessed type is instantiated
    with `filename` and returned.

    Returns None when the mimetype cannot be guessed or when no subclass
    claims it.
    """
    mtype, _ = mimetypes.guess_type(filename)
    if mtype is None:
        # Unknown type: no parser can claim it, so skip the scan entirely.
        return None

    # NOTE(review): __subclasses__() only lists direct subclasses that have
    # already been imported; presumably the star import of `.parsers` at the
    # top of this module is what registers them -- confirm.
    for parser_class in abstract.AbstractParser.__subclasses__():
        # A subclass may not declare a class-level `mimetypes` attribute;
        # treat it as supporting nothing instead of raising AttributeError.
        if mtype in getattr(parser_class, 'mimetypes', ()):
            return parser_class(filename)

    return None

View File

@ -3,6 +3,7 @@ class AbstractParser(object):
self.filename = filename self.filename = filename
self.output_filename = filename + '.cleaned' self.output_filename = filename + '.cleaned'
self.meta_list = set() self.meta_list = set()
self.mimetypes = set()
def get_meta(self): def get_meta(self):
raise NotImplementedError raise NotImplementedError

View File

@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser): class PDFParser(abstract.AbstractParser):
def __init__(self, filename): mimetypes = {'application/pdf', }
super().__init__(filename) meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'metadata', 'mod-date', 'producer', 'subject', 'title', 'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'} 'viewer-preferences'}
def __init__(self, filename):
super().__init__(filename)
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None
def remove_all(self): def remove_all(self):
""" """
@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
PDF are removed via Poppler, because there is no way to tell PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering. cairo to not add "created by cairo" during rendering.
""" """
document = Poppler.Document.new_from_file(self.uri, self.password) document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages() pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp() _, tmp_path = tempfile.mkstemp()
@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
print("URI: %s", self.uri) print("URI: %s", self.uri)
document = Poppler.Document.new_from_file(self.uri, self.password) document = Poppler.Document.new_from_file(self.uri, None)
metadata = {} metadata = {}
for key in self.meta_list: for key in self.meta_list:
if document.get_property(key): if document.get_property(key):