# pdfparanoia/pdfparanoia/parser.py

# -*- coding: utf-8 -*-
"""
pdfparanoia.parser
~~~~~~~~~~~~~~~

Deals with the existential nature of parsing pdfs.

"""

try:
    from StringIO import StringIO
except ImportError: # py3k
    from io import StringIO, BytesIO

# Maybe one day pdfquery will be able to save pdf.
# from pdfquery import PDFQuery

import pdfminer.pdfparser
import pdfminer.pdfdocument

from .eraser import replace_object_with

def parse_pdf(handler):
    """
    Builds a pdfminer document from a seekable file-like object.

    The handler is rewound to its start before parsing, and the initialized
    pdfminer document is returned.
    """
    # always read from the very start of the data
    handler.seek(0)

    # hook a parser for the handler into a new document
    document = pdfminer.pdfdocument.PDFDocument(
        pdfminer.pdfparser.PDFParser(handler)
    )

    # let pdfminer do the actual parsing
    document.initialize()
    return document

def parse_content(content):
    """
    Parses a PDF via pdfminer from in-memory content. There are some problems
    with pdfminer accepting StringIO objects, so this is a temporary hack.

    The content is wrapped in a file-like object and handed to parse_pdf,
    whose result is returned. Binary content (bytes/bytearray) is wrapped in
    a BytesIO on Python 3, because io.StringIO only accepts text there and
    would raise a TypeError. Everything else goes through StringIO as before.
    """
    # io.BytesIO exists on both Python 2.6+ and Python 3; imported here
    # because the module-level import only provides it on Python 3.
    from io import BytesIO

    # On Python 2 `bytes` is `str`, so plain strings fail the second half of
    # this check and keep using StringIO exactly as they always did.
    is_binary = isinstance(content, (bytes, bytearray)) and not isinstance(content, str)

    if is_binary:
        stream = BytesIO(content)
    else:
        stream = StringIO(content)

    return parse_pdf(stream)

def deflate(content):
    """
    Converts FlateDecode streams into plaintext streams. This significantly
    increases the size of a pdf, but it's useful for debugging and searching
    for how watermarks are implemented.

    Only objects listed in the first entry of the document's xrefs are
    scanned, and only streams whose decoded data is shorter than 1000 bytes
    are actually replaced.

    Not all elements are preserved in the resulting document. This is for
    debugging only.

    Takes the raw pdf content and returns the content with the replacements
    applied.
    """
    # parse the pdf
    pdf = parse_content(content)

    # get a list of all object ids
    xref = pdf.xrefs[0]
    objids = xref.get_objids()

    # store new replacements as (objid, decoded data) pairs
    replacements = []

    # scan through each object looking for things to deflate
    for objid in objids:
        obj = pdf.getobj(objid)

        # only objects carrying an attribute dictionary can have a filter
        if not hasattr(obj, "attrs"):
            continue

        # `in` instead of dict.has_key(), which no longer exists on Python 3
        if "Filter" not in obj.attrs:
            continue
        if str(obj.attrs["Filter"]) != "/FlateDecode":
            continue

        obj.decode()
        data = obj.data

        # larger streams are left compressed
        if len(data) < 1000:
            replacements.append((objid, data))

    # apply the replacements to the document
    for (objid, replacement) in replacements:
        content = replace_object_with(content, objid, replacement)

    return content