mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-12-04 15:05:52 +01:00
WIP of "AdBlock for Science"
The purpose of adblock for science is to remove nasty ads from papers, which at the moment means only papers from Science Magazine as published by the American Association for the Advancement of Science (AAAS). I am really annoyed that I have to write an ad blocker... for science papers.
This commit is contained in:
parent
71aaf23285
commit
cc7d14d173
@ -11,4 +11,4 @@ from .aip import *
|
||||
from .ieee import *
|
||||
from .jstor import *
|
||||
from .rsc import *
|
||||
|
||||
from .sciencemagazine import *
|
||||
|
50
pdfparanoia/plugins/sciencemagazine.py
Normal file
50
pdfparanoia/plugins/sciencemagazine.py
Normal file
@ -0,0 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from copy import copy
|
||||
import sys
|
||||
|
||||
from ..parser import parse_content
|
||||
from ..eraser import remove_object_by_id
|
||||
from ..plugin import Plugin
|
||||
|
||||
class ScienceMagazine(Plugin):
    """
    Science Magazine
    ~~~~~~~~~~~~~~~~

    Remove ads from academic papers. :(
    """

    # TODO: better confirmation that the paper is from sciencemag. Look for
    # "oascentral" in one of the URIs, since the ads are all hyperlinked to
    # that server.

    @classmethod
    def scrub(cls, content, verbose=0):
        """
        Remove the advertisement object from a Science Magazine PDF.

        An object is treated as an ad when it exposes an ``attrs`` mapping
        whose ``Width`` is 432 and whose ``Height`` is 230 (compared as
        strings).

        :param content: raw PDF content, as accepted by ``parse_content``.
        :param verbose: verbosity level; currently unused by this plugin.
        :returns: the content after every detected ad object has been passed
            through ``remove_object_by_id``.
        :raises Exception: if more than one ad-sized object is detected, as a
            safety check against removing legitimate content.
        """
        evil_ids = []

        # parse the pdf into a pdfminer document
        pdf = parse_content(content)

        # get a list of all object ids (only the first xref section is used)
        xrefs = pdf._parser.read_xref()
        xref = xrefs[0]
        objids = xref.get_objids()

        # check each object in the pdf
        for objid in objids:
            # get an object by id
            obj = pdf.getobj(objid)

            # only objects carrying an attribute dictionary can be matched
            if not hasattr(obj, "attrs"):
                continue

            attrs = obj.attrs

            # the ads are recognised purely by their fixed dimensions
            if ("Width" in attrs and str(attrs["Width"]) == "432"
                    and "Height" in attrs and str(attrs["Height"]) == "230"):
                evil_ids.append(objid)

        # more than one match is suspicious: refuse rather than risk deleting
        # real figures that happen to share the ad dimensions
        if len(evil_ids) > 1:
            raise Exception("too many ads detected on the page, please double check?")

        for objid in evil_ids:
            content = remove_object_by_id(content, objid)

        return content
|
Loading…
Reference in New Issue
Block a user