mirror of
https://github.com/kanzure/pdfparanoia.git
synced 2024-05-29 07:08:03 +02:00
Compare commits
26 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
100d202221 | ||
|
c1af289767 | ||
|
26dfe093fe | ||
|
681af5c175 | ||
|
906150e033 | ||
|
585a0ac3a4 | ||
|
5669e4e289 | ||
|
c594ff41d6 | ||
|
1a01757f44 | ||
|
5cc682e2c5 | ||
|
e95374ec04 | ||
|
95a249d8ab | ||
|
380bc289b3 | ||
|
713776af67 | ||
|
e9e0ea4467 | ||
|
61e67d2c4a | ||
|
28bf8f5825 | ||
|
1ff513389f | ||
|
cc7d14d173 | ||
|
71aaf23285 | ||
|
528eae7e46 | ||
|
59a71a7cd3 | ||
|
c3e590f22f | ||
|
f3e4b74b69 | ||
|
6030778089 | ||
|
1070605316 |
6
Makefile
6
Makefile
|
@ -1,7 +1,7 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
|
|
||||||
test:
|
test:
|
||||||
nosetests-2.7 -s --verbosity=2
|
nosetests -s --verbosity=2
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -fr build dist
|
rm -fr build dist
|
||||||
|
@ -10,8 +10,8 @@ clean:
|
||||||
find . -name *.swp -exec rm {} \;
|
find . -name *.swp -exec rm {} \;
|
||||||
|
|
||||||
install:
|
install:
|
||||||
python2.7 setup.py install
|
python3 setup.py install
|
||||||
|
|
||||||
upload: clean
|
upload: clean
|
||||||
python2.7 setup.py sdist upload
|
python3 setup.py sdist upload
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ publishers include private information like institution names, personal names,
|
||||||
ip addresses, timestamps and other identifying information in watermarks on
|
ip addresses, timestamps and other identifying information in watermarks on
|
||||||
each page.
|
each page.
|
||||||
|
|
||||||
pdfparania это библиотека для удаления водяных знаков из PDF файлов научных
|
pdfparanoia это библиотека для удаления водяных знаков из PDF файлов научных
|
||||||
статей. Некоторые издатели включают личную информацию, такую как названия
|
статей. Некоторые издатели включают личную информацию, такую как названия
|
||||||
институтов, имена, IP-адреса, время и дату и другую информацию в водяные знаки
|
институтов, имена, IP-адреса, время и дату и другую информацию в водяные знаки
|
||||||
содержащиеся на каждой странице.
|
содержащиеся на каждой странице.
|
||||||
|
@ -26,6 +26,7 @@ sudo python setup.py install
|
||||||
|
|
||||||
pdfparanoia is written for python2.7+ or python 3.
|
pdfparanoia is written for python2.7+ or python 3.
|
||||||
You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia.
|
You will also need to manually install "pdfminer" if you do not use pip to install pdfparanoia.
|
||||||
|
For Python 2, use "pdfminer" from the Python Package Index (http://pypi.python.org). For Python 3, use "pdfminer3k" instead.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,11 @@ if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
import pdfparanoia
|
import pdfparanoia
|
||||||
import argparse
|
import argparse
|
||||||
from StringIO import StringIO
|
|
||||||
|
try:
|
||||||
|
from StringIO import StringIO
|
||||||
|
except ImportError:
|
||||||
|
from io import StringIO, BytesIO
|
||||||
|
|
||||||
ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.")
|
ArgP = argparse.ArgumentParser(description="pdfparanoia is a PDF watermark removal library for academic papers. Some publishers include private information like institution names, personal names, ip addresses, timestamps and other identifying information in watermarks on each page.")
|
||||||
ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'),
|
ArgP.add_argument('in_pdf', nargs='?', type=argparse.FileType('rb'),
|
||||||
|
|
|
@ -1,2 +1,29 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
pdfparanoia - pdf watermark remover library for academic papers
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
pdfparanoia is a pdf watermark remover library for academic papers. Basic
|
||||||
|
usage:
|
||||||
|
|
||||||
|
>>> import pdfparanoia
|
||||||
|
>>> pdf = pdfparanoia.scrub(open("nmat.pdf", "r"))
|
||||||
|
>>> file_handler = open("output.pdf", "w")
|
||||||
|
>>> file_handler.write(pdf)
|
||||||
|
>>> file_handler.close()
|
||||||
|
|
||||||
|
:copyright: (c) 2013 by Bryan Bishop.
|
||||||
|
:license: BSD.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__title__ = "pdfparanoia"
|
||||||
|
__version__ = "0.0.17"
|
||||||
|
__build__ = 0x000017
|
||||||
|
__author__ = "Bryan Bishop <kanzure@gmail.com>"
|
||||||
|
__license__ = "BSD"
|
||||||
|
__copyright__ = "Copyright 2013 Bryan Bishop"
|
||||||
|
|
||||||
|
from . import utils
|
||||||
|
from .core import scrub
|
||||||
|
from .parser import deflate
|
||||||
|
|
||||||
|
|
|
@ -7,15 +7,16 @@ Deals with the existential nature of parsing pdfs.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from StringIO import StringIO
|
try:
|
||||||
|
from StringIO import StringIO
|
||||||
|
except ImportError: # py3k
|
||||||
|
from io import StringIO, BytesIO
|
||||||
|
|
||||||
# Maybe one day pdfquery will be able to save pdf.
|
# Maybe one day pdfquery will be able to save pdf.
|
||||||
# from pdfquery import PDFQuery
|
# from pdfquery import PDFQuery
|
||||||
|
|
||||||
from pdfminer.pdfparser import (
|
import pdfminer.pdfparser
|
||||||
PDFParser,
|
import pdfminer.pdfdocument
|
||||||
PDFDocument,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .eraser import replace_object_with
|
from .eraser import replace_object_with
|
||||||
|
|
||||||
|
@ -27,10 +28,8 @@ def parse_pdf(handler):
|
||||||
handler.seek(0)
|
handler.seek(0)
|
||||||
|
|
||||||
# setup for parsing
|
# setup for parsing
|
||||||
parser = PDFParser(handler)
|
parser = pdfminer.pdfparser.PDFParser(handler)
|
||||||
doc = PDFDocument()
|
doc = pdfminer.pdfdocument.PDFDocument(parser)
|
||||||
parser.set_document(doc)
|
|
||||||
doc.set_parser(parser)
|
|
||||||
|
|
||||||
# actual parsing
|
# actual parsing
|
||||||
doc.initialize()
|
doc.initialize()
|
||||||
|
@ -58,8 +57,7 @@ def deflate(content):
|
||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# store new replacements
|
# store new replacements
|
||||||
|
|
|
@ -11,4 +11,4 @@ from .aip import *
|
||||||
from .ieee import *
|
from .ieee import *
|
||||||
from .jstor import *
|
from .jstor import *
|
||||||
from .rsc import *
|
from .rsc import *
|
||||||
|
from .sciencemagazine import *
|
||||||
|
|
|
@ -25,8 +25,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
@ -36,7 +35,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||||
|
|
||||||
if hasattr(obj, "attrs"):
|
if hasattr(obj, "attrs"):
|
||||||
# watermarks tend to be in FlateDecode elements
|
# watermarks tend to be in FlateDecode elements
|
||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
length = obj.attrs["Length"]
|
length = obj.attrs["Length"]
|
||||||
|
|
||||||
# the watermark is never very long
|
# the watermark is never very long
|
||||||
|
@ -45,7 +44,7 @@ class AmericanInstituteOfPhysics(Plugin):
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
phrase="Redistribution subject to AIP license or copyright"
|
phrase="Redistribution subject to AIP license or copyright"
|
||||||
if phrase in data:
|
if phrase in str(data):
|
||||||
if verbose >= 2:
|
if verbose >= 2:
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data))
|
||||||
elif verbose >= 1:
|
elif verbose >= 1:
|
||||||
|
|
|
@ -22,8 +22,7 @@ class IEEEXplore(Plugin):
|
||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
@ -33,13 +32,13 @@ class IEEEXplore(Plugin):
|
||||||
|
|
||||||
if hasattr(obj, "attrs"):
|
if hasattr(obj, "attrs"):
|
||||||
# watermarks tend to be in FlateDecode elements
|
# watermarks tend to be in FlateDecode elements
|
||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if "Filter" in obj.attrs and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
#length = obj.attrs["Length"]
|
#length = obj.attrs["Length"]
|
||||||
#rawdata = copy(obj.rawdata)
|
#rawdata = copy(obj.rawdata)
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
phrase= "Authorized licensed use limited to: "
|
phrase= "Authorized licensed use limited to: "
|
||||||
if phrase in data:
|
if phrase in str(data):
|
||||||
if verbose >= 2:
|
if verbose >= 2:
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
|
||||||
elif verbose >= 1:
|
elif verbose >= 1:
|
||||||
|
|
|
@ -10,6 +10,8 @@ from ..eraser import (
|
||||||
)
|
)
|
||||||
from ..plugin import Plugin
|
from ..plugin import Plugin
|
||||||
|
|
||||||
|
from pdfminer.pdftypes import PDFObjectNotFound
|
||||||
|
|
||||||
class JSTOR(Plugin):
|
class JSTOR(Plugin):
|
||||||
"""
|
"""
|
||||||
JSTOR
|
JSTOR
|
||||||
|
@ -44,55 +46,57 @@ class JSTOR(Plugin):
|
||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
# get an object by id
|
# get an object by id
|
||||||
obj = pdf.getobj(objid)
|
try:
|
||||||
|
obj = pdf.getobj(objid)
|
||||||
|
|
||||||
if hasattr(obj, "attrs"):
|
if hasattr(obj, "attrs"):
|
||||||
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
|
||||||
data = copy(obj.get_data())
|
data = copy(obj.get_data())
|
||||||
|
|
||||||
# make sure all of the requirements are in there
|
# make sure all of the requirements are in there
|
||||||
if all([requirement in data for requirement in JSTOR.requirements]):
|
if all([requirement in data for requirement in JSTOR.requirements]):
|
||||||
better_content = data
|
better_content = data
|
||||||
|
|
||||||
# remove the date
|
|
||||||
startpos = better_content.find("This content downloaded ")
|
|
||||||
endpos = better_content.find(")", startpos)
|
|
||||||
segment = better_content[startpos:endpos]
|
|
||||||
if verbose >= 2 and replacements:
|
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
|
||||||
|
|
||||||
better_content = better_content.replace(segment, "")
|
|
||||||
|
|
||||||
# it looks like all of the watermarks are at the end?
|
|
||||||
better_content = better_content[:-160]
|
|
||||||
|
|
||||||
# "Accessed on dd/mm/yyyy hh:mm"
|
|
||||||
#
|
|
||||||
# the "Accessed" line is only on the first page
|
|
||||||
#
|
|
||||||
# it's based on /F2
|
|
||||||
#
|
|
||||||
# This would be better if it could be decoded to
|
|
||||||
# actually search for the "Accessed" text.
|
|
||||||
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
|
||||||
startpos = better_content.rfind("/F2 11 Tf\n")
|
|
||||||
endpos = better_content.find("Tf\n", startpos+5)
|
|
||||||
|
|
||||||
|
# remove the date
|
||||||
|
startpos = better_content.find("This content downloaded ")
|
||||||
|
endpos = better_content.find(")", startpos)
|
||||||
|
segment = better_content[startpos:endpos]
|
||||||
if verbose >= 2 and replacements:
|
if verbose >= 2 and replacements:
|
||||||
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, segment))
|
||||||
|
|
||||||
better_content = better_content[0:startpos] + better_content[endpos:]
|
better_content = better_content.replace(segment, "")
|
||||||
|
|
||||||
replacements.append([objid, better_content])
|
# it looks like all of the watermarks are at the end?
|
||||||
|
better_content = better_content[:-160]
|
||||||
|
|
||||||
page_id += 1
|
# "Accessed on dd/mm/yyyy hh:mm"
|
||||||
|
#
|
||||||
|
# the "Accessed" line is only on the first page
|
||||||
|
#
|
||||||
|
# it's based on /F2
|
||||||
|
#
|
||||||
|
# This would be better if it could be decoded to
|
||||||
|
# actually search for the "Accessed" text.
|
||||||
|
if page_id == 0 and "/F2 11 Tf\n" in better_content:
|
||||||
|
startpos = better_content.rfind("/F2 11 Tf\n")
|
||||||
|
endpos = better_content.find("Tf\n", startpos+5)
|
||||||
|
|
||||||
|
if verbose >= 2 and replacements:
|
||||||
|
sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, cls.requirements, better_content[startpos:endpos]))
|
||||||
|
|
||||||
|
better_content = better_content[0:startpos] + better_content[endpos:]
|
||||||
|
|
||||||
|
replacements.append([objid, better_content])
|
||||||
|
|
||||||
|
page_id += 1
|
||||||
|
except PDFObjectNotFound as e:
|
||||||
|
print >>sys.stderr, 'Missing object: %r' % e
|
||||||
|
|
||||||
if verbose >= 1 and replacements:
|
if verbose >= 1 and replacements:
|
||||||
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
sys.stderr.write("%s: Found objects %s with %r; omitting..." % (cls.__name__, [deets[0] for deets in replacements], cls.requirements))
|
||||||
|
|
|
@ -42,8 +42,7 @@ class RoyalSocietyOfChemistry(Plugin):
|
||||||
pdf = parse_content(content)
|
pdf = parse_content(content)
|
||||||
|
|
||||||
# get a list of all object ids
|
# get a list of all object ids
|
||||||
xrefs = pdf._parser.read_xref()
|
xref = pdf.xrefs[0]
|
||||||
xref = xrefs[0]
|
|
||||||
objids = xref.get_objids()
|
objids = xref.get_objids()
|
||||||
|
|
||||||
# check each object in the pdf
|
# check each object in the pdf
|
||||||
|
|
49
pdfparanoia/plugins/sciencemagazine.py
Normal file
49
pdfparanoia/plugins/sciencemagazine.py
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from copy import copy
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from ..parser import parse_content
|
||||||
|
from ..eraser import remove_object_by_id
|
||||||
|
from ..plugin import Plugin
|
||||||
|
|
||||||
|
class ScienceMagazine(Plugin):
|
||||||
|
"""
|
||||||
|
Science Magazine
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Remove ads from academic papers. :(
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO: better confirmation that the paper is from sciencemag. Look for
|
||||||
|
# "oascentral" in one of the URIs, since the ads are all hyperlinked to
|
||||||
|
# that server.
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def scrub(cls, content, verbose=0):
|
||||||
|
evil_ids = []
|
||||||
|
|
||||||
|
# parse the pdf into a pdfminer document
|
||||||
|
pdf = parse_content(content)
|
||||||
|
|
||||||
|
# get a list of all object ids
|
||||||
|
xref = pdf.xrefs[0]
|
||||||
|
objids = xref.get_objids()
|
||||||
|
|
||||||
|
# check each object in the pdf
|
||||||
|
for objid in objids:
|
||||||
|
# get an object by id
|
||||||
|
obj = pdf.getobj(objid)
|
||||||
|
|
||||||
|
if hasattr(obj, "attrs"):
|
||||||
|
if ("Width" in obj.attrs) and str(obj.attrs["Width"]) == "432":
|
||||||
|
if "Height" in obj.attrs and str(obj.attrs["Height"]) == "230":
|
||||||
|
evil_ids.append(objid)
|
||||||
|
|
||||||
|
if len(evil_ids) > 1:
|
||||||
|
raise Exception("too many ads detected on the page, please double check?")
|
||||||
|
|
||||||
|
for objid in evil_ids:
|
||||||
|
content = remove_object_by_id(content, objid)
|
||||||
|
|
||||||
|
return content
|
|
@ -1 +1 @@
|
||||||
pdfminer>=0
|
pdfminer>=20131113
|
||||||
|
|
20
setup.py
20
setup.py
|
@ -1,12 +1,26 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
import os
|
import os
|
||||||
|
import platform
|
||||||
|
|
||||||
|
import pdfparanoia
|
||||||
|
|
||||||
long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).read()
|
long_description = open(os.path.join(os.path.dirname(__file__), "README.md")).read()
|
||||||
|
|
||||||
|
# pdfminer isn't cross-version compatible, but a Python 3 port (pdfminer3k) is available on PyPI
|
||||||
|
if platform.python_version() >= "3.0.0":
|
||||||
|
dependencies = ["pdfminer3k>=1.3.0"]
|
||||||
|
else:
|
||||||
|
dependencies = ["pdfminer>=20131113"]
|
||||||
|
|
||||||
|
packages = [
|
||||||
|
"pdfparanoia",
|
||||||
|
"pdfparanoia.plugins",
|
||||||
|
]
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="pdfparanoia",
|
name="pdfparanoia",
|
||||||
version="0.0.14",
|
version=pdfparanoia.__version__,
|
||||||
url="https://github.com/kanzure/pdfparanoia",
|
url="https://github.com/kanzure/pdfparanoia",
|
||||||
license="BSD",
|
license="BSD",
|
||||||
author="Bryan Bishop",
|
author="Bryan Bishop",
|
||||||
|
@ -15,7 +29,9 @@ setup(
|
||||||
maintainer_email="kanzure@gmail.com",
|
maintainer_email="kanzure@gmail.com",
|
||||||
description="pdf watermark remover library for academic papers",
|
description="pdf watermark remover library for academic papers",
|
||||||
long_description=long_description,
|
long_description=long_description,
|
||||||
install_requires=["pdfminer>=0"],
|
long_description_content_type="text/markdown",
|
||||||
|
install_requires=dependencies,
|
||||||
|
packages=packages,
|
||||||
scripts=["bin/pdfparanoia"],
|
scripts=["bin/pdfparanoia"],
|
||||||
platforms="any",
|
platforms="any",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
|
|
|
@ -8,13 +8,13 @@ class JSTORTestCase(unittest.TestCase):
|
||||||
file_handler = open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb")
|
file_handler = open("tests/samples/jstor/231a515256115368c142f528cee7f727.pdf", "rb")
|
||||||
content = file_handler.read()
|
content = file_handler.read()
|
||||||
file_handler.close()
|
file_handler.close()
|
||||||
self.assertIn("\n18 0 obj \n", content)
|
self.assertIn(b"\n18 0 obj \n", content)
|
||||||
|
|
||||||
# this section will later be manipulated
|
# this section will later be manipulated
|
||||||
self.assertIn("\n19 0 obj \n", content)
|
self.assertIn(b"\n19 0 obj \n", content)
|
||||||
|
|
||||||
output = pdfparanoia.plugins.JSTOR.scrub(content)
|
output = pdfparanoia.plugins.JSTOR.scrub(content)
|
||||||
|
|
||||||
# FlateDecode should be replaced with a decompressed section
|
# FlateDecode should be replaced with a decompressed section
|
||||||
self.assertIn("\n19 0 obj\n<</Length 2862>>stream", output)
|
self.assertIn(b"\n19 0 obj\n<</Length 2862>>stream", output)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user