pdfparanoia/pdfparanoia/core.py

# -*- coding: utf-8 -*-
"""
pdfparanoia.core
~~~~~~~~~~~~~~~

This module provides most of the heavy lifting of pdfparanoia.

"""

import sys
import inspect

from .parser import (
    parse_pdf,
    parse_content,
)

from .plugin import Plugin

from pdfparanoia.plugins import *

def find_plugins():
    """
    Returns a list of all compatible plugins.
    """
    def inspection(thing):
        iswanted = inspect.isclass(thing)
        iswanted = iswanted and issubclass(thing, Plugin)
        iswanted = iswanted and thing is not Plugin
        return iswanted
    plugins = inspect.getmembers(sys.modules[__name__], inspection)
    plugins = [each[1] for each in plugins]
    return plugins

def scrub(obj, verbose=False):
    """
    Removes watermarks from a pdf and returns the resulting pdf as a string.
    """
    # reset the file handler
    if hasattr(obj, "seek"):
        obj.seek(0)
    else:
        obj = open(obj, "rb")

    # load up the raw bytes
    content = obj.read()

    # get a list of plugins that will manipulate this paper
    plugins = find_plugins()

    # clean this pdf as much as possible
    for plugin in plugins:
        content = plugin.scrub(content, verbose=verbose)

    return content