1
0
mirror of https://github.com/kanzure/pdfparanoia.git synced 2024-12-04 23:15:52 +01:00

WIP of "AdBlock for Science"

The purpose of adblock for science is to remove nasty ads from papers,
which at the moment means only papers from Science Magazine as published
by the American Association for the Advancement of Science (AAAS).

I am really annoyed that I have to write an ad blocker... for science
papers.
This commit is contained in:
Bryan Bishop 2013-07-19 21:27:50 -05:00
parent 71aaf23285
commit cc7d14d173
2 changed files with 51 additions and 1 deletions

View File

@ -11,4 +11,4 @@ from .aip import *
from .ieee import *
from .jstor import *
from .rsc import *
from .sciencemagazine import *

View File

@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
from copy import copy
import sys
from ..parser import parse_content
from ..eraser import remove_object_by_id
from ..plugin import Plugin
class ScienceMagazine(Plugin):
    """
    Science Magazine
    ~~~~~~~~~~~~~~~

    Remove ads from academic papers. :(
    """

    # TODO: better confirmation that the paper is from sciencemag. Look for
    # "oascentral" in one of the URIs, since the ads are all hyperlinked to
    # that server.

    @classmethod
    def scrub(cls, content, verbose=0):
        """Remove the advertisement object from a Science Magazine PDF.

        Every object listed in the first cross-reference section of the PDF
        is inspected; an object counts as an ad when it has an ``attrs``
        mapping whose ``Width`` is 432 and whose ``Height`` is 230 (compared
        by string representation).

        :param content: raw PDF content, passed to ``parse_content`` and
            ``remove_object_by_id``.
        :param verbose: accepted for interface compatibility; currently
            unused by this plugin.
        :returns: ``content`` after ``remove_object_by_id`` has been applied
            to the detected ad object, or ``content`` unchanged when no ad
            object was found.
        :raises Exception: when more than one candidate ad object is found,
            as a safeguard against removing legitimate images.
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids; only the first xref section is
        # consulted
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # objects without an attribute mapping cannot be matched
            if not hasattr(obj, "attrs"):
                continue

            attrs = obj.attrs

            # the ad is recognized purely by its fixed 432x230 dimensions
            if (
                "Width" in attrs
                and str(attrs["Width"]) == "432"
                and "Height" in attrs
                and str(attrs["Height"]) == "230"
            ):
                evil_ids.append(objid)

        # more than one match suggests the size heuristic hit real content
        if len(evil_ids) > 1:
            raise Exception("too many ads detected on the page, please double check?")

        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content