From cc7d14d173be9b4a79adb97fba092914255a92f4 Mon Sep 17 00:00:00 2001 From: Bryan Bishop Date: Fri, 19 Jul 2013 21:27:50 -0500 Subject: [PATCH] WIP of "AdBlock for Science" The purpose of adblock for science is to remove nasty ads from papers, which at the moment means only papers from Science Magazine as published by the American Association for the Advancement of Science (AAAS). I am really annoyed that I have to write an ad blocker... for science papers. --- pdfparanoia/plugins/__init__.py | 2 +- pdfparanoia/plugins/sciencemagazine.py | 50 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 pdfparanoia/plugins/sciencemagazine.py diff --git a/pdfparanoia/plugins/__init__.py b/pdfparanoia/plugins/__init__.py index 2717682..1a15413 100644 --- a/pdfparanoia/plugins/__init__.py +++ b/pdfparanoia/plugins/__init__.py @@ -11,4 +11,4 @@ from .aip import * from .ieee import * from .jstor import * from .rsc import * - +from .sciencemagazine import * diff --git a/pdfparanoia/plugins/sciencemagazine.py b/pdfparanoia/plugins/sciencemagazine.py new file mode 100644 index 0000000..033fdb3 --- /dev/null +++ b/pdfparanoia/plugins/sciencemagazine.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- + +from copy import copy +import sys + +from ..parser import parse_content +from ..eraser import remove_object_by_id +from ..plugin import Plugin + +class ScienceMagazine(Plugin): + """ + Science Magazine + ~~~~~~~~~~~~~~~ + + Remove ads from academic papers. :( + """ + + # TODO: better confirmation that the paper is from sciencemag. Look for + # "oascentral" in one of the URIs, since the ads are all hyperlinked to + # that server. 
+
+    @classmethod
+    def scrub(cls, content, verbose=0):
+        """
+        Erase the ad object from a pdf and return the resulting content.
+
+        An object counts as an ad when its attrs give a Width of 432 and a
+        Height of 230. Raises Exception if more than one object matches.
+        The verbose argument is accepted but not used here.
+        """
+        # parse the pdf into a pdfminer document
+        pdf = parse_content(content)
+
+        # object ids come from the first entry returned by read_xref()
+        objids = pdf._parser.read_xref()[0].get_objids()
+
+        # collect every object whose declared dimensions match the ad
+        evil_ids = []
+        for objid in objids:
+            obj = pdf.getobj(objid)
+            if hasattr(obj, "attrs"):
+                if "Width" in obj.attrs and str(obj.attrs["Width"]) == "432":
+                    if "Height" in obj.attrs and str(obj.attrs["Height"]) == "230":
+                        evil_ids.append(objid)
+
+        if len(evil_ids) > 1:
+            raise Exception("too many ads detected on the page, please double check?")
+        for objid in evil_ids:
+            content = remove_object_by_id(content, objid)
+        return content