diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py index 2717682..1a15413 100644 --- a/pdfparanoia/plugins/__init__.py +++ b/pdfparanoia/plugins/__init__.py @@ -11,4 +11,4 @@ from .aip import * from .ieee import * from .jstor import * from .rsc import * - +from .sciencemagazine import * diff --git a/pdfparanoia/plugins/sciencemagazine.py b/pdfparanoia/plugins/sciencemagazine.py new file mode 100644 index 0000000..033fdb3 --- /dev/null +++ b/pdfparanoia/plugins/sciencemagazine.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- + +from copy import copy +import sys + +from ..parser import parse_content +from ..eraser import remove_object_by_id +from ..plugin import Plugin + +class ScienceMagazine(Plugin): + """ + Science Magazine + ~~~~~~~~~~~~~~~ + + Remove ads from academic papers. :( + """ + + # TODO: better confirmation that the paper is from sciencemag. Look for + # "oascentral" in one of the URIs, since the ads are all hyperlinked to + # that server. + + @classmethod + def scrub(cls, content, verbose=0): + evil_ids = [] + + # parse the pdf into a pdfminer document + pdf = parse_content(content) + + # get a list of all object ids + xrefs = pdf._parser.read_xref() + xref = xrefs[0] + objids = xref.get_objids() + + # check each object in the pdf + for objid in objids: + # get an object by id + obj = pdf.getobj(objid) + + if hasattr(obj, "attrs"): + if "Width" in obj.attrs) and str(obj.attrs["Width"]) == "432": + if "Height" in obj.attrs and str(obj.attrs["Height"]) == "230": + evil_ids.append(objid) + + if len(evil_ids) > 1: + raise Exception("too many ads detected on the page, please double check?") + + for objid in evil_ids: + content = remove_object_by_id(content, objid) + + return content