# This version was generated with the help of the original author (Mitch B) and ChatGPT. Status: WORKING.
#Subject: Re: picture tag with source subtags
#Author: Mitch B
#Date: 06/22/2017 23:41
#
#Here is a Python script that I wrote that removes all the srcset tags, which
#you can use after your website is downloaded.
#Given a PATH, go through all the html files and delete all of the srcset
#subtags for img tags.
#Files will be overwritten.
from bs4 import BeautifulSoup
import os
from glob import glob
import codecs
import sys
# Python 2/3 compatible tkinter import: try the Python 3 module names first
# and fall back to the Python 2 names when they are not importable.
try:
    import tkinter as tk
    from tkinter import messagebox
except ImportError:
    import Tkinter as tk
    import tkMessageBox as messagebox
#PATH = "<downloaded website path>"
# Keep this file in the project folder (not inside the website folder) and run it from there.
# >>> Auto detect HTTrack website folder (first folder next to this script)
#script_dir = os.path.dirname(os.path.abspath(__file__))
#subdirs = [d for d in os.listdir(script_dir) if os.path.isdir(os.path.join(script_dir, d))]  # Remove this line if the script is put in the website folder
# Python 2/3 compatible __file__ handling: fall back to the invoked script
# path (sys.argv[0]) when __file__ is not defined.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))

# >>> Auto detect HTTrack website folder (first folder next to this script)
# Remove this block entirely if script is placed *inside* website folder
#
# Folders that can sit next to the script but never hold the mirrored site:
# HTTrack's own cache directory and Python's bytecode cache.
_NON_SITE_DIRS = ('hts-cache', '__pycache__')

# os.listdir() returns entries in arbitrary order, so sort the candidates to
# make "first folder" mean the same thing on every run and every platform.
subdirs = sorted(
    d for d in os.listdir(script_dir)
    if os.path.isdir(os.path.join(script_dir, d))
    and not d.startswith('.')
    and d.lower() not in _NON_SITE_DIRS
)

if not subdirs:
    print("No website folder found next to script.")
    sys.exit(1)

# Root folder whose HTML files will be rewritten.
PATH = os.path.join(script_dir, subdirs[0])
# <<<
# ---------------------------------------------------------------------------
# GUI confirmation box: show the detected folder and let the user cancel
# before any file is overwritten.
# ---------------------------------------------------------------------------
root = tk.Tk()
root.withdraw()  # Hide main window; only the dialog should be visible
try:
    msg = "Script will modify HTML files in:\n\n{}\n\nProceed?".format(PATH)
    answer = messagebox.askokcancel("Confirm Path", msg)
finally:
    # The hidden root window is only needed to host the dialog; release it
    # instead of keeping it alive for the rest of the run.
    root.destroy()

if not answer:
    print("Operation cancelled by user.")
    sys.exit(0)
# ---------------------------------------------------------------------------
# End of GUI confirmation box
# ---------------------------------------------------------------------------
#result = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0],'*.html'))]
print("\nScanning for HTML files...\n")

# File-name endings treated as HTML pages (compared case-insensitively).
HTML_EXTENSIONS = ('.html', '.htm')

# Collect every HTML file below PATH.  The names reported by os.walk() are
# filtered directly instead of being passed through glob(): glob interprets
# "[", "]", "*" and "?" in a directory name as pattern characters, which makes
# it silently skip such directories.  Only regular entries from the walk's
# file list are considered, and the result is sorted so the processing order
# is stable between runs.
result = sorted(
    os.path.join(dirpath, name)
    for dirpath, _dirnames, filenames in os.walk(PATH)
    for name in filenames
    if name.lower().endswith(HTML_EXTENSIONS)
)

total_files = len(result)
print("Total HTML files found: {}\n".format(total_files))
print("Beginning srcset removal...\n")
#for filename in result:
# print(filename)
# file = codecs.open(filename,'r','utf-8')
# data = file.read()
# soup = BeautifulSoup(data, 'html.parser')
#
# for p in soup.find_all('img'):
# if 'srcset' in p.attrs:
# del p.attrs['srcset']
#
# file.close()
# file1 = codecs.open(filename,'w','utf-8')
# file1.write(soup.prettify())
# file1.close()
# Strip the "srcset" attribute from every <img> and <source> tag of each HTML
# file in `result`, rewriting only the files that actually changed.
processed = 0  # files visited
modified = 0   # files rewritten on disk
for filename in result:
    print("[PROCESSING] {}".format(filename))
    processed += 1

    # Read the whole file as UTF-8.  A file in another encoding is reported
    # and skipped so one bad page does not abort the remaining files.
    try:
        with codecs.open(filename, 'r', 'utf-8') as src:
            data = src.read()
    except UnicodeDecodeError as err:
        print(" [SKIPPED] not valid UTF-8: {}".format(err))
        continue

    soup = BeautifulSoup(data, 'html.parser')

    # Remove srcset from <img> and <source> in a single pass.
    changed = False
    for tag in soup.find_all(['img', 'source']):
        if tag.has_attr('srcset'):
            del tag['srcset']
            changed = True

    if changed:
        # Serialize with decode() rather than prettify(): prettify() inserts
        # newlines/indentation, which can alter how whitespace-sensitive
        # markup renders; decode() emits the tree without extra whitespace.
        with codecs.open(filename, 'w', 'utf-8') as dst:
            dst.write(soup.decode())
        modified += 1
        print(" [UPDATED]")
    else:
        print(" [NO CHANGE]")
# Final report: one line per entry, framed by separator rules.
_separator = "===================================="
_report_lines = (
    "\n" + _separator,
    "Completed srcset cleanup",
    "Total files scanned : {}".format(processed),
    "Files modified : {}".format(modified),
    _separator + "\n",
)
for _line in _report_lines:
    print(_line)
# Pause so terminal does not close immediately
try:
    # msvcrt only exists on Windows; it can wait for a single key press.
    import msvcrt
except ImportError:
    msvcrt = None

if msvcrt is not None:
    print("\nPress any key to exit...")
    msvcrt.getch()
else:
    # Without msvcrt a whole line has to be read, so ask for Enter.
    print("\nPress Enter to exit...")
    try:
        # Python 2: raw_input() reads a line; input() would evaluate it.
        read_line = raw_input
    except NameError:
        # Python 3: input() reads a line.
        read_line = input
    try:
        read_line()
    except (EOFError, KeyboardInterrupt, IOError, OSError, RuntimeError):
        # The pause is best-effort: a closed, redirected or missing stdin
        # (or Ctrl+C) simply ends the script.
        pass
# End of script.