Implement mimetype detection
This commit is contained in:
parent
d262f780f7
commit
8f44616366
9
main.py
9
main.py
@ -3,6 +3,7 @@ from shutil import copyfile
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
from src.parsers import pdf
|
from src.parsers import pdf
|
||||||
|
from src import parser_factory
|
||||||
|
|
||||||
|
|
||||||
def create_arg_parser():
|
def create_arg_parser():
|
||||||
@ -19,7 +20,7 @@ def create_arg_parser():
|
|||||||
return parser
|
return parser
|
||||||
|
|
||||||
def show_meta(file_name:str):
|
def show_meta(file_name:str):
|
||||||
p = pdf.PDFParser(file_name)
|
p = parser_factory(file_name)
|
||||||
for k,v in p.get_meta().items():
|
for k,v in p.get_meta().items():
|
||||||
print("%s: %s" % (k, v))
|
print("%s: %s" % (k, v))
|
||||||
|
|
||||||
@ -32,10 +33,10 @@ def main():
|
|||||||
show_meta(f)
|
show_meta(f)
|
||||||
return 0
|
return 0
|
||||||
elif not args.files:
|
elif not args.files:
|
||||||
return parser.show_help()
|
return argparser.show_help()
|
||||||
|
|
||||||
copyfile(sys.argv[1] + '.bak', sys.argv[1])
|
#p = pdf.PDFParser(sys.argv[1])
|
||||||
p = pdf.PDFParser(sys.argv[1])
|
p = parser_factory.get_parser(sys.argv[1])
|
||||||
p.remove_all()
|
p.remove_all()
|
||||||
p = pdf.PDFParser('OUT_clean.pdf')
|
p = pdf.PDFParser('OUT_clean.pdf')
|
||||||
print("ok")
|
print("ok")
|
||||||
|
10
src/parser_factory.py
Normal file
10
src/parser_factory.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import mimetypes
|
||||||
|
|
||||||
|
from .parsers import abstract
|
||||||
|
from .parsers import *
|
||||||
|
|
||||||
|
def get_parser(filename: str):
|
||||||
|
mtype, _ = mimetypes.guess_type(filename)
|
||||||
|
for c in abstract.AbstractParser.__subclasses__():
|
||||||
|
if mtype in c.mimetypes:
|
||||||
|
return c(filename)
|
@ -3,6 +3,7 @@ class AbstractParser(object):
|
|||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.output_filename = filename + '.cleaned'
|
self.output_filename = filename + '.cleaned'
|
||||||
self.meta_list = set()
|
self.meta_list = set()
|
||||||
|
self.mimetypes = set()
|
||||||
|
|
||||||
def get_meta(self):
|
def get_meta(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
|
|||||||
|
|
||||||
|
|
||||||
class PDFParser(abstract.AbstractParser):
|
class PDFParser(abstract.AbstractParser):
|
||||||
def __init__(self, filename):
|
mimetypes = {'application/pdf', }
|
||||||
super().__init__(filename)
|
meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
|
||||||
self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
|
|
||||||
'metadata', 'mod-date', 'producer', 'subject', 'title',
|
'metadata', 'mod-date', 'producer', 'subject', 'title',
|
||||||
'viewer-preferences'}
|
'viewer-preferences'}
|
||||||
|
|
||||||
|
def __init__(self, filename):
|
||||||
|
super().__init__(filename)
|
||||||
self.uri = 'file://' + os.path.abspath(self.filename)
|
self.uri = 'file://' + os.path.abspath(self.filename)
|
||||||
self.password = None
|
|
||||||
|
|
||||||
def remove_all(self):
|
def remove_all(self):
|
||||||
"""
|
"""
|
||||||
@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
PDF are removed via Poppler, because there is no way to tell
|
PDF are removed via Poppler, because there is no way to tell
|
||||||
cairo to not add "created by cairo" during rendering.
|
cairo to not add "created by cairo" during rendering.
|
||||||
"""
|
"""
|
||||||
document = Poppler.Document.new_from_file(self.uri, self.password)
|
document = Poppler.Document.new_from_file(self.uri, None)
|
||||||
pages_count = document.get_n_pages()
|
pages_count = document.get_n_pages()
|
||||||
|
|
||||||
_, tmp_path = tempfile.mkstemp()
|
_, tmp_path = tempfile.mkstemp()
|
||||||
@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
""" Return a dict with all the meta of the file
|
""" Return a dict with all the meta of the file
|
||||||
"""
|
"""
|
||||||
print("URI: %s", self.uri)
|
print("URI: %s", self.uri)
|
||||||
document = Poppler.Document.new_from_file(self.uri, self.password)
|
document = Poppler.Document.new_from_file(self.uri, None)
|
||||||
metadata = {}
|
metadata = {}
|
||||||
for key in self.meta_list:
|
for key in self.meta_list:
|
||||||
if document.get_property(key):
|
if document.get_property(key):
|
||||||
|
Loading…
Reference in New Issue
Block a user