
Improve the code's documentation

jvoisin 2018-07-19 23:10:27 +02:00
parent 565cb66d14
commit 942859601d
6 changed files with 36 additions and 18 deletions

View File

@@ -6,10 +6,16 @@ assert Set # make pyflakes happy
class AbstractParser(abc.ABC):
""" This is the base classe of every parser.
It might yeild `ValueError` on instanciation on invalid files.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
def __init__(self, filename: str) -> None:
"""
:raises ValueError: Raised upon an invalid file
"""
self.filename = filename
fname, extension = os.path.splitext(filename)
self.output_filename = fname + '.cleaned' + extension
@@ -23,5 +29,8 @@ class AbstractParser(abc.ABC):
pass # pragma: no cover
def remove_all_lightweight(self) -> bool:
""" Remove _SOME_ metadata. """
""" This method removes _SOME_ metadata.
I might be useful to implement it for fileformats that do
not support non-destructive cleaning.
"""
return self.remove_all()
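The `ValueError` contract documented above is what lets callers try a parser and fall back cleanly when a file is malformed. A hypothetical subclass relying on it might look like this (the class name and magic number are made up for illustration):

from typing import Dict

from . import abstract


class MagicNumberParser(abstract.AbstractParser):
    """ Hypothetical parser that refuses files lacking a known magic number. """
    mimetypes = {'application/x-example'}

    def __init__(self, filename: str) -> None:
        super().__init__(filename)  # sets self.filename and self.output_filename
        with open(filename, 'rb') as f:
            if f.read(4) != b'EXMP':  # made-up magic number
                raise ValueError('%s is not a valid file' % filename)

    def get_meta(self) -> Dict[str, str]:
        return {}

    def remove_all(self) -> bool:
        return True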

View File

@@ -4,7 +4,7 @@ from . import abstract
class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that do not contain metadata. """
""" This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'}
def get_meta(self) -> Dict[str, str]:
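The hunk is truncated here; for context, a parser for metadata-free formats has essentially nothing to do, roughly along these lines (a sketch of the shape, not necessarily the file's exact content):

import shutil
from typing import Dict

from . import abstract


class HarmlessParser(abstract.AbstractParser):
    """ The formats handled here carry no metadata, so cleaning is a plain copy. """
    mimetypes = {'text/plain', 'image/x-ms-bmp'}

    def get_meta(self) -> Dict[str, str]:
        return dict()  # nothing to report

    def remove_all(self) -> bool:
        shutil.copy(self.filename, self.output_filename)
        return True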

View File

@@ -19,6 +19,9 @@ from . import abstract
assert Set
class _ImageParser(abstract.AbstractParser):
""" Since we use `exiftool` to get metadata from
all images fileformat, `get_meta` is implemented in this class,
and all the image-handling ones are inheriting from it."""
meta_whitelist = set() # type: Set[str]
@staticmethod
@@ -72,7 +75,7 @@ class PNGParser(_ImageParser):
class GdkPixbufAbstractParser(_ImageParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of removing metadata completely.
this has the side-effect of completely removing metadata.
"""
_type = ''
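Neither hunk shows the `exiftool` call that `get_meta` is built on; conceptually it amounts to parsing exiftool's JSON output, roughly as follows (a sketch assuming `exiftool` is available on the PATH):

import json
import subprocess
from typing import Dict


def _get_meta_via_exiftool(filename: str) -> Dict[str, str]:
    """ Sketch: ask exiftool for every tag as JSON, return them as a dict. """
    out = subprocess.check_output(['exiftool', '-json', filename])
    meta = json.loads(out.decode('utf-8'))[0]  # exiftool emits a list of objects
    # 'SourceFile' is just the path we passed in, not metadata.
    return {k: str(v) for k, v in meta.items() if k != 'SourceFile'}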

View File

@@ -33,6 +33,7 @@ def _parse_xml(full_path: str):
class ArchiveBasedAbstractParser(abstract.AbstractParser):
""" Office files (.docx, .odt, …) are zipped files. """
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
files_to_keep = set() # type: Set[str]
@@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
zipinfo.create_system = 3 # Linux
zipinfo.comment = b''
zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
return zipinfo
@staticmethod
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
metadata = {}
if zipinfo.create_system == 3:
#metadata['create_system'] = 'Linux'
if zipinfo.create_system == 3: # this is Linux
pass
elif zipinfo.create_system == 2:
metadata['create_system'] = 'Windows'
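Taken together, `_clean_zipinfo` means every member written back into the archive carries the same fixed header fields, so cleaning the same document twice yields identical member headers. The same normalisation as a standalone sketch over a whole archive (a hypothetical helper, not the parser's actual method):

import zipfile


def _clean_copy(src: str, dst: str) -> None:
    """ Sketch: rewrite an archive with normalised per-member metadata. """
    with zipfile.ZipFile(src) as zin, zipfile.ZipFile(dst, 'w') as zout:
        for item in zin.infolist():
            data = zin.read(item.filename)
            cleaned = zipfile.ZipInfo(item.filename)
            cleaned.create_system = 3  # pretend everything comes from Linux
            cleaned.comment = b''
            cleaned.date_time = (1980, 1, 1, 0, 0, 0)  # earliest possible zip date
            zout.writestr(cleaned, data)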
@@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
@staticmethod
def __remove_revisions(full_path: str) -> bool:
""" In this function, we're changing the XML
document in two times, since we don't want
to change the tree we're iterating on."""
""" In this function, we're changing the XML document in several
different times, since we don't want to change the tree we're currently
iterating on.
"""
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError:
return False
# No revisions are present
# Revisions are either deletions (`w:del`) or
# insertions (`w:ins`)
del_presence = tree.find('.//w:del', namespace)
ins_presence = tree.find('.//w:ins', namespace)
if del_presence is None and ins_presence is None:
return True
return True # No revisions are present
parent_map = {c:p for p in tree.iter() for c in p}
elements = list([element for element in tree.iterfind('.//w:del', namespace)])
elements = list()
for element in tree.iterfind('.//w:del', namespace):
elements.append(element)
for element in elements:
parent_map[element].remove(element)
@@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for children in element.iterfind('./*'):
elements.append((element, position, children))
break
for (element, position, children) in elements:
parent_map[element].insert(position, children)
parent_map[element].remove(element)
@@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
def _specific_cleanup(self, full_path: str) -> bool:
if full_path.endswith('/word/document.xml'):
# this file contains the revisions
return self.__remove_revisions(full_path)
return True
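The `parent_map` step above is needed because `xml.etree.ElementTree` elements keep no reference to their parent, and removing nodes while iterating over them would skip siblings; hence the build-a-map-first, remove-afterwards pattern. The same two-step pattern on a toy document:

import xml.etree.ElementTree as ET

root = ET.fromstring('<doc><keep/><del>old</del><keep/></doc>')

# Build the child -> parent lookup once, before mutating anything.
parent_map = {child: parent for parent in root.iter() for child in parent}

# Collect first, remove afterwards, so the iteration itself is never disturbed.
to_remove = [element for element in root.iter('del')]
for element in to_remove:
    parent_map[element].remove(element)

print(ET.tostring(root))  # b'<doc><keep /><keep /></doc>'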

View File

@@ -13,10 +13,12 @@ T = TypeVar('T', bound='abstract.AbstractParser')
def __load_all_parsers():
""" Loads every parser in a dynamic way """
current_dir = os.path.dirname(__file__)
for name in glob.glob(os.path.join(current_dir, '*.py')):
if name.endswith('abstract.py') or name.endswith('__init__.py'):
for fname in glob.glob(os.path.join(current_dir, '*.py')):
if fname.endswith('abstract.py'):
continue
basename = os.path.basename(name)
elif fname.endswith('__init__.py'):
continue
basename = os.path.basename(fname)
name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2')
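Once every module has been imported this way, no explicit registry is needed: the concrete parsers are reachable through `AbstractParser.__subclasses__()`. A hedged sketch of how a lookup could use that (the helper names here are hypothetical, not the package's actual API):

from typing import Optional, Type

from . import abstract


def _iter_parsers():
    """ Sketch: walk AbstractParser's subclass tree, depth-first. """
    stack = list(abstract.AbstractParser.__subclasses__())
    while stack:
        cls = stack.pop()
        stack.extend(cls.__subclasses__())
        yield cls


def get_parser_for(mtype: str) -> Optional[Type[abstract.AbstractParser]]:
    """ Sketch: return the first imported parser declaring this mimetype. """
    for parser_class in _iter_parsers():
        if mtype in parser_class.mimetypes:
            return parser_class
    return None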

View File

@@ -47,7 +47,7 @@ class PDFParser(abstract.AbstractParser):
pages_count = document.get_n_pages()
tmp_path = tempfile.mkstemp()[1]
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
pdf_context = cairo.Context(pdf_surface) # context draws on the surface
for pagenum in range(pages_count):
@@ -101,7 +101,7 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
pdf_context.set_source_surface(img, 0, 0)
pdf_context.paint()
pdf_context.show_page()
pdf_context.show_page() # draw pdf_context on pdf_surface
pdf_surface.finish()
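For context, the cleaning strategy behind these two hunks is to rasterise every page and paint the pixels onto a brand-new cairo PDF surface, which by construction carries none of the original file's metadata. A rough standalone sketch of that loop (assumes python-gi with Poppler bindings; `SCALE` is a made-up factor standing in for `self.__scale`):

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler

SCALE = 2  # hypothetical rasterisation factor


def _rerender(src_uri: str, dst_path: str) -> None:
    """ Sketch: rasterise each page, then paint it onto a fresh PDF surface. """
    document = Poppler.Document.new_from_file(src_uri, None)
    pdf_surface = cairo.PDFSurface(dst_path, 10, 10)  # resized per page below
    pdf_context = cairo.Context(pdf_surface)  # context draws on the surface
    for pagenum in range(document.get_n_pages()):
        page = document.get_page(pagenum)
        page_width, page_height = page.get_size()

        # Render the page to a raster surface first, so only pixels survive.
        img = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                 int(page_width * SCALE), int(page_height * SCALE))
        img_context = cairo.Context(img)
        img_context.scale(SCALE, SCALE)
        page.render_for_printing(img_context)

        pdf_surface.set_size(page_width * SCALE, page_height * SCALE)
        pdf_context.set_source_surface(img, 0, 0)
        pdf_context.paint()
        pdf_context.show_page()  # flush this page to pdf_surface
    pdf_surface.finish()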