Improve the code's documentation
This commit is contained in:
parent
565cb66d14
commit
942859601d
@ -6,10 +6,16 @@ assert Set # make pyflakes happy
|
|||||||
|
|
||||||
|
|
||||||
class AbstractParser(abc.ABC):
|
class AbstractParser(abc.ABC):
|
||||||
|
""" This is the base class of every parser.
|
||||||
|
It might raise `ValueError` on instantiation for invalid files.
|
||||||
|
"""
|
||||||
meta_list = set() # type: Set[str]
|
meta_list = set() # type: Set[str]
|
||||||
mimetypes = set() # type: Set[str]
|
mimetypes = set() # type: Set[str]
|
||||||
|
|
||||||
def __init__(self, filename: str) -> None:
|
def __init__(self, filename: str) -> None:
|
||||||
|
"""
|
||||||
|
:raises ValueError: Raised upon an invalid file
|
||||||
|
"""
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
fname, extension = os.path.splitext(filename)
|
fname, extension = os.path.splitext(filename)
|
||||||
self.output_filename = fname + '.cleaned' + extension
|
self.output_filename = fname + '.cleaned' + extension
|
||||||
@ -23,5 +29,8 @@ class AbstractParser(abc.ABC):
|
|||||||
pass # pragma: no cover
|
pass # pragma: no cover
|
||||||
|
|
||||||
def remove_all_lightweight(self) -> bool:
|
def remove_all_lightweight(self) -> bool:
|
||||||
""" Remove _SOME_ metadata. """
|
""" This method removes _SOME_ metadata.
|
||||||
|
It might be useful to implement it for file formats that do
|
||||||
|
not support non-destructive cleaning.
|
||||||
|
"""
|
||||||
return self.remove_all()
|
return self.remove_all()
|
||||||
|
@ -4,7 +4,7 @@ from . import abstract
|
|||||||
|
|
||||||
|
|
||||||
class HarmlessParser(abstract.AbstractParser):
|
class HarmlessParser(abstract.AbstractParser):
|
||||||
""" This is the parser for filetypes that do not contain metadata. """
|
""" This is the parser for filetypes that cannot contain metadata. """
|
||||||
mimetypes = {'text/plain', 'image/x-ms-bmp'}
|
mimetypes = {'text/plain', 'image/x-ms-bmp'}
|
||||||
|
|
||||||
def get_meta(self) -> Dict[str, str]:
|
def get_meta(self) -> Dict[str, str]:
|
||||||
|
@ -19,6 +19,9 @@ from . import abstract
|
|||||||
assert Set
|
assert Set
|
||||||
|
|
||||||
class _ImageParser(abstract.AbstractParser):
|
class _ImageParser(abstract.AbstractParser):
|
||||||
|
""" Since we use `exiftool` to get metadata from
|
||||||
|
all image file formats, `get_meta` is implemented in this class,
|
||||||
|
and all the image-handling parsers inherit from it. """
|
||||||
meta_whitelist = set() # type: Set[str]
|
meta_whitelist = set() # type: Set[str]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -72,7 +75,7 @@ class PNGParser(_ImageParser):
|
|||||||
|
|
||||||
class GdkPixbufAbstractParser(_ImageParser):
|
class GdkPixbufAbstractParser(_ImageParser):
|
||||||
""" GdkPixbuf can handle a lot of surfaces, so we're rendering images on it,
|
""" GdkPixbuf can handle a lot of surfaces, so we're rendering images on it,
|
||||||
this has the side-effect of removing metadata completely.
|
this has the side-effect of completely removing metadata.
|
||||||
"""
|
"""
|
||||||
_type = ''
|
_type = ''
|
||||||
|
|
||||||
|
@ -33,6 +33,7 @@ def _parse_xml(full_path: str):
|
|||||||
|
|
||||||
|
|
||||||
class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||||
|
""" Office files (.docx, .odt, …) are zipped files. """
|
||||||
# Those are the files that have a format that _isn't_
|
# Those are the files that have a format that _isn't_
|
||||||
# supported by MAT2, but that we want to keep anyway.
|
# supported by MAT2, but that we want to keep anyway.
|
||||||
files_to_keep = set() # type: Set[str]
|
files_to_keep = set() # type: Set[str]
|
||||||
@ -58,14 +59,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
|||||||
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
|
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
|
||||||
zipinfo.create_system = 3 # Linux
|
zipinfo.create_system = 3 # Linux
|
||||||
zipinfo.comment = b''
|
zipinfo.comment = b''
|
||||||
zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
|
zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
|
||||||
return zipinfo
|
return zipinfo
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
|
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
|
||||||
metadata = {}
|
metadata = {}
|
||||||
if zipinfo.create_system == 3:
|
if zipinfo.create_system == 3: # this is Linux
|
||||||
#metadata['create_system'] = 'Linux'
|
|
||||||
pass
|
pass
|
||||||
elif zipinfo.create_system == 2:
|
elif zipinfo.create_system == 2:
|
||||||
metadata['create_system'] = 'Windows'
|
metadata['create_system'] = 'Windows'
|
||||||
@ -145,23 +145,27 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __remove_revisions(full_path: str) -> bool:
|
def __remove_revisions(full_path: str) -> bool:
|
||||||
""" In this function, we're changing the XML
|
""" In this function, we're changing the XML document in several
|
||||||
document in two times, since we don't want
|
separate passes, since we don't want to change the tree we're currently
|
||||||
to change the tree we're iterating on."""
|
iterating on.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
tree, namespace = _parse_xml(full_path)
|
tree, namespace = _parse_xml(full_path)
|
||||||
except ET.ParseError:
|
except ET.ParseError:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# No revisions are present
|
# Revisions are either deletions (`w:del`) or
|
||||||
|
# insertions (`w:ins`)
|
||||||
del_presence = tree.find('.//w:del', namespace)
|
del_presence = tree.find('.//w:del', namespace)
|
||||||
ins_presence = tree.find('.//w:ins', namespace)
|
ins_presence = tree.find('.//w:ins', namespace)
|
||||||
if del_presence is None and ins_presence is None:
|
if del_presence is None and ins_presence is None:
|
||||||
return True
|
return True # No revisions are present
|
||||||
|
|
||||||
parent_map = {c:p for p in tree.iter() for c in p}
|
parent_map = {c:p for p in tree.iter() for c in p}
|
||||||
|
|
||||||
elements = list([element for element in tree.iterfind('.//w:del', namespace)])
|
elements = list()
|
||||||
|
for element in tree.iterfind('.//w:del', namespace):
|
||||||
|
elements.append(element)
|
||||||
for element in elements:
|
for element in elements:
|
||||||
parent_map[element].remove(element)
|
parent_map[element].remove(element)
|
||||||
|
|
||||||
@ -172,7 +176,6 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
for children in element.iterfind('./*'):
|
for children in element.iterfind('./*'):
|
||||||
elements.append((element, position, children))
|
elements.append((element, position, children))
|
||||||
break
|
break
|
||||||
|
|
||||||
for (element, position, children) in elements:
|
for (element, position, children) in elements:
|
||||||
parent_map[element].insert(position, children)
|
parent_map[element].insert(position, children)
|
||||||
parent_map[element].remove(element)
|
parent_map[element].remove(element)
|
||||||
@ -183,6 +186,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
|
|||||||
|
|
||||||
def _specific_cleanup(self, full_path: str) -> bool:
|
def _specific_cleanup(self, full_path: str) -> bool:
|
||||||
if full_path.endswith('/word/document.xml'):
|
if full_path.endswith('/word/document.xml'):
|
||||||
|
# this file contains the revisions
|
||||||
return self.__remove_revisions(full_path)
|
return self.__remove_revisions(full_path)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -13,10 +13,12 @@ T = TypeVar('T', bound='abstract.AbstractParser')
|
|||||||
def __load_all_parsers():
|
def __load_all_parsers():
|
||||||
""" Loads every parser in a dynamic way """
|
""" Loads every parser in a dynamic way """
|
||||||
current_dir = os.path.dirname(__file__)
|
current_dir = os.path.dirname(__file__)
|
||||||
for name in glob.glob(os.path.join(current_dir, '*.py')):
|
for fname in glob.glob(os.path.join(current_dir, '*.py')):
|
||||||
if name.endswith('abstract.py') or name.endswith('__init__.py'):
|
if fname.endswith('abstract.py'):
|
||||||
continue
|
continue
|
||||||
basename = os.path.basename(name)
|
elif fname.endswith('__init__.py'):
|
||||||
|
continue
|
||||||
|
basename = os.path.basename(fname)
|
||||||
name, _ = os.path.splitext(basename)
|
name, _ = os.path.splitext(basename)
|
||||||
importlib.import_module('.' + name, package='libmat2')
|
importlib.import_module('.' + name, package='libmat2')
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
pages_count = document.get_n_pages()
|
pages_count = document.get_n_pages()
|
||||||
|
|
||||||
tmp_path = tempfile.mkstemp()[1]
|
tmp_path = tempfile.mkstemp()[1]
|
||||||
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
|
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
|
||||||
pdf_context = cairo.Context(pdf_surface) # context draws on the surface
|
pdf_context = cairo.Context(pdf_surface) # context draws on the surface
|
||||||
|
|
||||||
for pagenum in range(pages_count):
|
for pagenum in range(pages_count):
|
||||||
@ -101,7 +101,7 @@ class PDFParser(abstract.AbstractParser):
|
|||||||
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
|
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
|
||||||
pdf_context.set_source_surface(img, 0, 0)
|
pdf_context.set_source_surface(img, 0, 0)
|
||||||
pdf_context.paint()
|
pdf_context.paint()
|
||||||
pdf_context.show_page()
|
pdf_context.show_page() # draw pdf_context on pdf_surface
|
||||||
|
|
||||||
pdf_surface.finish()
|
pdf_surface.finish()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user