1
0
mirror of synced 2024-11-22 01:04:23 +01:00

Improve the code's documentation

This commit is contained in:
jvoisin 2018-07-19 23:10:27 +02:00
parent 565cb66d14
commit 942859601d
6 changed files with 36 additions and 18 deletions

View File

@ -6,10 +6,16 @@ assert Set # make pyflakes happy
class AbstractParser(abc.ABC): class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might raise `ValueError` on instantiation on invalid files.
"""
meta_list = set() # type: Set[str] meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str] mimetypes = set() # type: Set[str]
def __init__(self, filename: str) -> None: def __init__(self, filename: str) -> None:
"""
:raises ValueError: Raised upon an invalid file
"""
self.filename = filename self.filename = filename
fname, extension = os.path.splitext(filename) fname, extension = os.path.splitext(filename)
self.output_filename = fname + '.cleaned' + extension self.output_filename = fname + '.cleaned' + extension
@ -23,5 +29,8 @@ class AbstractParser(abc.ABC):
pass # pragma: no cover pass # pragma: no cover
def remove_all_lightweight(self) -> bool: def remove_all_lightweight(self) -> bool:
""" Remove _SOME_ metadata. """ """ This method removes _SOME_ metadata.
It might be useful to implement it for file formats that do
not support non-destructive cleaning.
"""
return self.remove_all() return self.remove_all()

View File

@ -4,7 +4,7 @@ from . import abstract
class HarmlessParser(abstract.AbstractParser): class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that do not contain metadata. """ """ This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'} mimetypes = {'text/plain', 'image/x-ms-bmp'}
def get_meta(self) -> Dict[str, str]: def get_meta(self) -> Dict[str, str]:

View File

@ -19,6 +19,9 @@ from . import abstract
assert Set assert Set
class _ImageParser(abstract.AbstractParser): class _ImageParser(abstract.AbstractParser):
""" Since we use `exiftool` to get metadata from
all image file formats, `get_meta` is implemented in this class,
and all the image-handling ones are inheriting from it."""
meta_whitelist = set() # type: Set[str] meta_whitelist = set() # type: Set[str]
@staticmethod @staticmethod
@ -72,7 +75,7 @@ class PNGParser(_ImageParser):
class GdkPixbufAbstractParser(_ImageParser): class GdkPixbufAbstractParser(_ImageParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it, """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of removing metadata completely. this has the side-effect of completely removing metadata.
""" """
_type = '' _type = ''

View File

@ -33,6 +33,7 @@ def _parse_xml(full_path: str):
class ArchiveBasedAbstractParser(abstract.AbstractParser): class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """
# Those are the files that have a format that _isn't_ # Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway. # supported by MAT2, but that we want to keep anyway.
files_to_keep = set() # type: Set[str] files_to_keep = set() # type: Set[str]
@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
zipinfo.create_system = 3 # Linux zipinfo.create_system = 3 # Linux
zipinfo.comment = b'' zipinfo.comment = b''
zipinfo.date_time = (1980, 1, 1, 0, 0, 0) zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
return zipinfo return zipinfo
@staticmethod @staticmethod
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
metadata = {} metadata = {}
if zipinfo.create_system == 3: if zipinfo.create_system == 3: # this is Linux
#metadata['create_system'] = 'Linux'
pass pass
elif zipinfo.create_system == 2: elif zipinfo.create_system == 2:
metadata['create_system'] = 'Windows' metadata['create_system'] = 'Windows'
@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
@staticmethod @staticmethod
def __remove_revisions(full_path: str) -> bool: def __remove_revisions(full_path: str) -> bool:
""" In this function, we're changing the XML """ In this function, we're changing the XML document in several
document in two times, since we don't want different times, since we don't want to change the tree we're currently
to change the tree we're iterating on.""" iterating on.
"""
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
except ET.ParseError: except ET.ParseError:
return False return False
# No revisions are present # Revisions are either deletions (`w:del`) or
# insertions (`w:ins`)
del_presence = tree.find('.//w:del', namespace) del_presence = tree.find('.//w:del', namespace)
ins_presence = tree.find('.//w:ins', namespace) ins_presence = tree.find('.//w:ins', namespace)
if del_presence is None and ins_presence is None: if del_presence is None and ins_presence is None:
return True return True # No revisions are present
parent_map = {c:p for p in tree.iter() for c in p} parent_map = {c:p for p in tree.iter() for c in p}
elements = list([element for element in tree.iterfind('.//w:del', namespace)]) elements = list()
for element in tree.iterfind('.//w:del', namespace):
elements.append(element)
for element in elements: for element in elements:
parent_map[element].remove(element) parent_map[element].remove(element)
@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for children in element.iterfind('./*'): for children in element.iterfind('./*'):
elements.append((element, position, children)) elements.append((element, position, children))
break break
for (element, position, children) in elements: for (element, position, children) in elements:
parent_map[element].insert(position, children) parent_map[element].insert(position, children)
parent_map[element].remove(element) parent_map[element].remove(element)
@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
if full_path.endswith('/word/document.xml'): if full_path.endswith('/word/document.xml'):
# this file contains the revisions
return self.__remove_revisions(full_path) return self.__remove_revisions(full_path)
return True return True

View File

@ -13,10 +13,12 @@ T = TypeVar('T', bound='abstract.AbstractParser')
def __load_all_parsers(): def __load_all_parsers():
""" Loads every parser in a dynamic way """ """ Loads every parser in a dynamic way """
current_dir = os.path.dirname(__file__) current_dir = os.path.dirname(__file__)
for name in glob.glob(os.path.join(current_dir, '*.py')): for fname in glob.glob(os.path.join(current_dir, '*.py')):
if name.endswith('abstract.py') or name.endswith('__init__.py'): if fname.endswith('abstract.py'):
continue continue
basename = os.path.basename(name) elif fname.endswith('__init__.py'):
continue
basename = os.path.basename(fname)
name, _ = os.path.splitext(basename) name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2') importlib.import_module('.' + name, package='libmat2')

View File

@ -47,7 +47,7 @@ class PDFParser(abstract.AbstractParser):
pages_count = document.get_n_pages() pages_count = document.get_n_pages()
tmp_path = tempfile.mkstemp()[1] tmp_path = tempfile.mkstemp()[1]
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
pdf_context = cairo.Context(pdf_surface) # context draws on the surface pdf_context = cairo.Context(pdf_surface) # context draws on the surface
for pagenum in range(pages_count): for pagenum in range(pages_count):
@ -101,7 +101,7 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
pdf_context.set_source_surface(img, 0, 0) pdf_context.set_source_surface(img, 0, 0)
pdf_context.paint() pdf_context.paint()
pdf_context.show_page() pdf_context.show_page() # draw pdf_context on pdf_surface
pdf_surface.finish() pdf_surface.finish()