Check PDF is from the RSC before cleaning

2025-06-13 19:42:01 +02:00 · 2013-05-13 21:01:52 +01:00 · 2013-05-13 21:01:52 +01:00 · c673d77ec6
commit c673d77ec6
parent 18140d838d
1 changed file with 28 additions and 25 deletions
--- a/pdfparanoia/plugins/rsc.py
+++ b/pdfparanoia/plugins/rsc.py
@@ -35,35 +35,38 @@ class RoyalSocietyOfChemistry(Plugin):
            #"Table of Contents for this issue",
        ]
-        # parse the pdf into a pdfminer document
+        # Confirm the PDF is from the RSC
-        pdf = parse_content(content)
+        if "pubs.rsc.org" in content:
            # parse the pdf into a pdfminer document
            pdf = parse_content(content)
-        # get a list of all object ids
+            # get a list of all object ids
-        xrefs = pdf._parser.read_xref()
+            xrefs = pdf._parser.read_xref()
-        xref = xrefs[0]
+            xref = xrefs[0]
-        objids = xref.get_objids()
+            objids = xref.get_objids()
-        # check each object in the pdf
+            # check each object in the pdf
-        for objid in objids:
+            for objid in objids:
-            # get an object by id
+                # get an object by id
-            obj = pdf.getobj(objid)
+                obj = pdf.getobj(objid)
-            if hasattr(obj, "attrs"):
+                if hasattr(obj, "attrs"):
-                # watermarks tend to be in FlateDecode elements
+                    # watermarks tend to be in FlateDecode elements
-                if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
+                    if obj.attrs.has_key("Filter") and str(obj.attrs["Filter"]) == "/FlateDecode":
-                    rawdata = copy(obj.rawdata)
+                        rawdata = copy(obj.rawdata)
-                    data = copy(obj.get_data())
+                        data = copy(obj.get_data())
-                    # Check if any of the watermarks are in the current object
+                        # Check if any of the watermarks are in the current object
-                    for phrase in watermarks:
+                        for phrase in watermarks:
-                        if phrase in data:
+                            if phrase in data:
-                            if verbose >= 2:
+                                if verbose >= 2:
-                                sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
+                                    sys.stderr.write("%s: Found object %s with %r: %r; omitting...\n" % (cls.__name__, objid, phrase, data[data.index(phrase):data.index(phrase)+1000]))
-                            elif verbose >= 1:
+                                elif verbose >= 1:
-                                sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase)) 
+                                    sys.stderr.write("%s: Found object %s with %r; omitting...\n" % (cls.__name__, objid, phrase)) 
-                            
+                                
-                            # We had a match so replace the watermark data with an empty string                 
+                                # We had a match so replace the watermark data with an empty string                 
-                            replacements.append([rawdata, ""])
+                                replacements.append([rawdata, ""])
        for deets in replacements:
            # Directly replace the stream data in binary encoded object