pdfparanoia/pdfparanoia/plugins/jstor.py

# -*- coding: utf-8 -*-

from copy import copy

from ..parser import parse_content
from ..eraser import (
    remove_object_by_id,
    replace_object_with,
)
from ..plugin import Plugin

class JSTOR(Plugin):
    """
    JSTOR
    ~~~~~~~~~~~~~~~

    JSTOR watermarks a first page with an "Accessed" date, lots of TC barf, and
    then also a watermark at the bottom of each page with a timestamp.

    Watermarks removed:
        * "Accessed" timestamp on the front page
        * footer watermarks on each page

    This was primary written for JSTOR pdfs generated by:
         /Producer (itext-paulo-155 \(itextpdf.sf.net-lowagie.com\))
    """

    # these terms appear on a page that has been watermarked
    requirements = [
        "All use subject to ",
        "JSTOR Terms and Conditions",
        "This content downloaded  on",
    ]

    @staticmethod
    def scrub(content):
        replacements = []

        # jstor has certain watermarks only on the first page
        page_id = 0

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            if hasattr(obj, "attrs"):
                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
                    length = obj.attrs["Length"]
                    rawdata = copy(obj.rawdata)
                    data = copy(obj.get_data())

                    # make sure all of the requirements are in there
                    if all([requirement in data for requirement in JSTOR.requirements]):
                        better_content = data

                        # remove the date
                        startpos = better_content.find("This content downloaded ")
                        endpos = better_content.find(")", startpos)
                        segment = better_content[startpos:endpos]
                        better_content = better_content.replace(segment, "")

                        # it looks like all of the watermarks are at the end?
                        better_content = better_content[:-160]

                        # "Accessed on dd/mm/yyy hh:mm"
                        #
                        # the "Accessed" line is only on the first page
                        #
                        # it's based on /F2
                        #
                        # This would be better if it could be decoded to
                        # actually search for the "Accessed" text.
                        if page_id == 0 and "/F2 11 Tf\n" in better_content:
                            startpos = better_content.rfind("/F2 11 Tf\n")
                            endpos = better_content.find("Tf\n", startpos+5)
                            better_content = better_content[0:startpos] + better_content[endpos:]

                        replacements.append([objid, better_content])

                        page_id += 1

        for deets in replacements:
            objid = deets[0]
            replacement = deets[1]
            content = replace_object_with(content, objid, replacement)

        return content
jstor watermark removal fixes #1 2013-02-07 00:31:19 +01:00			`# -- coding: utf-8 --`

			`from copy import copy`

			`from ..parser import parse_content`
			`from ..eraser import (`
			`remove_object_by_id,`
			`replace_object_with,`
			`)`
			`from ..plugin import Plugin`

			`class JSTOR(Plugin):`
			`"""`
			`JSTOR`
			`~~~~~~~~~~~~~~~`

			`JSTOR watermarks a first page with an "Accessed" date, lots of TC barf, and`
			`then also a watermark at the bottom of each page with a timestamp.`

			`Watermarks removed:`
			`* "Accessed" timestamp on the front page`
			`* footer watermarks on each page`

			`This was primary written for JSTOR pdfs generated by:`
			`/Producer (itext-paulo-155 \(itextpdf.sf.net-lowagie.com\))`
			`"""`

			`# these terms appear on a page that has been watermarked`
			`requirements = [`
			`"All use subject to ",`
			`"JSTOR Terms and Conditions",`
			`"This content downloaded on",`
			`]`

			`@staticmethod`
			`def scrub(content):`
			`replacements = []`

			`# jstor has certain watermarks only on the first page`
			`page_id = 0`

			`# parse the pdf into a pdfminer document`
			`pdf = parse_content(content)`

			`# get a list of all object ids`
			`xrefs = pdf._parser.read_xref()`
			`xref = xrefs[0]`
			`objids = xref.get_objids()`

			`# check each object in the pdf`
			`for objid in objids:`
			`# get an object by id`
			`obj = pdf.getobj(objid)`

			`if hasattr(obj, "attrs"):`
			`if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":`
			`length = obj.attrs["Length"]`
			`rawdata = copy(obj.rawdata)`
			`data = copy(obj.get_data())`

			`# make sure all of the requirements are in there`
			`if all([requirement in data for requirement in JSTOR.requirements]):`
			`better_content = data`

			`# remove the date`
			`startpos = better_content.find("This content downloaded ")`
			`endpos = better_content.find(")", startpos)`
			`segment = better_content[startpos:endpos]`
			`better_content = better_content.replace(segment, "")`

			`# it looks like all of the watermarks are at the end?`
			`better_content = better_content[:-160]`

			`# "Accessed on dd/mm/yyy hh:mm"`
			`#`
			`# the "Accessed" line is only on the first page`
			`#`
			`# it's based on /F2`
			`#`
			`# This would be better if it could be decoded to`
			`# actually search for the "Accessed" text.`
			`if page_id == 0 and "/F2 11 Tf\n" in better_content:`
			`startpos = better_content.rfind("/F2 11 Tf\n")`
			`endpos = better_content.find("Tf\n", startpos+5)`
			`better_content = better_content[0:startpos] + better_content[endpos:]`

			`replacements.append([objid, better_content])`

			`page_id += 1`

			`for deets in replacements:`
			`objid = deets[0]`
			`replacement = deets[1]`
			`content = replace_object_with(content, objid, replacement)`

			`return content`