diff --git a/main.py b/main.py
index e4157e6..4b965b4 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@ from shutil import copyfile
 import argparse
 
 from src.parsers import pdf
+from src import parser_factory
 
 
 def create_arg_parser():
@@ -19,7 +20,7 @@ def create_arg_parser():
     return parser
 
 def show_meta(file_name:str):
-    p = pdf.PDFParser(file_name)
+    p = parser_factory.get_parser(file_name)
     for k,v in p.get_meta().items():
         print("%s: %s" % (k, v))
 
@@ -32,10 +33,10 @@ def main():
             show_meta(f)
         return 0
     elif not args.files:
-        return parser.show_help()
+        return argparser.print_help()
 
-    copyfile(sys.argv[1] + '.bak', sys.argv[1])
-    p = pdf.PDFParser(sys.argv[1])
+    #p = pdf.PDFParser(sys.argv[1])
+    p = parser_factory.get_parser(sys.argv[1])
     p.remove_all()
     p = pdf.PDFParser('OUT_clean.pdf')
     print("ok")
diff --git a/src/parser_factory.py b/src/parser_factory.py
new file mode 100644
index 0000000..a93595a
--- /dev/null
+++ b/src/parser_factory.py
@@ -0,0 +1,25 @@
+import mimetypes
+
+from .parsers import abstract
+# NOTE(review): presumably imported so that the parser modules get loaded
+# and their classes appear in AbstractParser.__subclasses__() -- confirm
+# that src/parsers/__init__.py really exposes them.
+from .parsers import *
+
+def get_parser(filename: str):
+    """ Return a parser instance for `filename`, or None.
+
+    The mimetype is guessed from the file name; the first direct subclass
+    of AbstractParser whose class-level `mimetypes` contains it is
+    instantiated with `filename`. None is returned when the type cannot
+    be guessed or when no parser claims it.
+    """
+    mtype, _ = mimetypes.guess_type(filename)
+    if mtype is None:
+        return None
+    for c in abstract.AbstractParser.__subclasses__():
+        # A parser class without a class-level `mimetypes` must not abort
+        # the lookup with an AttributeError, hence the getattr default.
+        if mtype in getattr(c, 'mimetypes', ()):
+            return c(filename)
+    return None
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index d0e7108..80bb812 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -3,6 +3,10 @@ class AbstractParser(object):
         self.filename = filename
         self.output_filename = filename + '.cleaned'
-        self.meta_list = set()
+        # Parsers declare meta_list/mimetypes at class level; assigning
+        # fresh empty sets here would shadow those declarations, so only
+        # default to an empty set when the class declares nothing.
+        self.meta_list = getattr(self, 'meta_list', set())
+        self.mimetypes = getattr(self, 'mimetypes', set())
 
     def get_meta(self):
         raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index 26985c6..e7bd00d 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
 
 
 class PDFParser(abstract.AbstractParser):
+    mimetypes = {'application/pdf', }
+    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
+                 'metadata', 'mod-date', 'producer', 'subject', 'title',
+                 'viewer-preferences'}
+
     def __init__(self, filename):
         super().__init__(filename)
-        self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
-                          'metadata', 'mod-date', 'producer', 'subject', 'title',
-                          'viewer-preferences'}
         self.uri = 'file://' + os.path.abspath(self.filename)
-        self.password = None
 
     def remove_all(self):
         """
@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
         PDF are removed via Poppler, because there is no way to tell
         cairo to not add "created by cairo" during rendering.
         """
-        document = Poppler.Document.new_from_file(self.uri, self.password)
+        document = Poppler.Document.new_from_file(self.uri, None)
         pages_count = document.get_n_pages()
 
         _, tmp_path = tempfile.mkstemp()
@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
         """ Return a dict with all the meta of the file
         """
-        print("URI: %s", self.uri)
-        document = Poppler.Document.new_from_file(self.uri, self.password)
+        print("URI: %s" % self.uri)
+        document = Poppler.Document.new_from_file(self.uri, None)
         metadata = {}
         for key in self.meta_list:
             if document.get_property(key):