| | Modified the script and fixed the errors when decoding non-UTF-8 files.
HTTrack works fine for downloading apart from this bug (the settings need to be
set correctly) ... better than commercial programs like WebZip, IDM Grabber and
Offline Explorer, and the free Cyotek WebCopy.
(Also, images in <https://www.editpadpro.com/> won't load -
<https://www.editpadpro.com//img/icon100.png> won't link.) (Is there any way to
delete/modify a reply?)
Here is the revised, working Python code (you can always ask ChatGPT to improve it):
#Subject: Re: picture tag with source subtags
#Author: Mitch B
#Date: 06/22/2017 23:41
#
#Here is a Python script that I wrote that removes all the srcset attributes;
#you can run it after your website is downloaded.
#Given a PATH, go through all the html files and delete all of the srcset
#attributes of the img tags.
#Files will be overwritten.
from bs4 import BeautifulSoup
import os
from glob import glob
import codecs
import chardet
import sys
# Python 2/3 compatible tkinter import
try:
import tkinter as tk
from tkinter import messagebox
except ImportError:
import Tkinter as tk
import tkMessageBox as messagebox
#PATH = "<downloaded website path>"
# Keep this file in the project folder (not inside the website folder) and run it.
# >>> Auto detect HTTrack website folder (first folder next to this script)
#script_dir = os.path.dirname(os.path.abspath(__file__))
#subdirs = [d for d in os.listdir(script_dir) if os.path.isdir(os.path.join(script_dir, d))]
# (Remove the line above if the script is put in the website folder.)
# Locate the folder that contains this script.
# __file__ is not defined in every launch mode (a NameError is raised when it
# is missing), so fall back to the path the interpreter was started with.
try:
    _script_file = __file__
except NameError:
    _script_file = sys.argv[0]
script_dir = os.path.dirname(os.path.abspath(_script_file))
# >>> Auto detect HTTrack website folder (first folder next to this script)
# Remove this block entirely if script is placed *inside* website folder
# Candidate website folders: visible sub-directories next to the script.
# HTTrack keeps its own cache in "hts-cache" inside the project folder, and
# Python may leave a "__pycache__" there; neither is a mirrored website, and
# "hts-cache" would otherwise sort ahead of a typical "www.*" site folder.
_NOT_A_WEBSITE = ('hts-cache', '__pycache__')
# os.listdir() returns entries in arbitrary order, so sort to make the
# "first folder" choice the same on every run and every platform.
subdirs = sorted(
    d for d in os.listdir(script_dir)
    if os.path.isdir(os.path.join(script_dir, d))
    and not d.startswith('.')
    and d.lower() not in _NOT_A_WEBSITE
)
if not subdirs:
    print("No website folder found next to script.")
    sys.exit(1)
# Root folder whose HTML files will be rewritten.
PATH = os.path.join(script_dir, subdirs[0])
# <<<
# GUI confirmation box -------------------------------------------------
# Files under PATH are overwritten in place, so show the detected folder
# and require an explicit OK before anything is modified.
root = tk.Tk()
root.withdraw()  # Hide main window; only the dialog should be visible
try:
    msg = "Script will modify HTML files in:\n\n{}\n\nProceed?".format(PATH)
    answer = messagebox.askokcancel("Confirm Path", msg)
finally:
    # The hidden root exists only to host the dialog; release it so no Tk
    # window lingers for the rest of the run.
    root.destroy()
if not answer:
    print("Operation cancelled by user.")
    sys.exit(0)
# GUI confirmation box -------------------------------------------------
#result = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0],'*.html'))]
# Collect every *.html file below PATH (recursively) into `result`.
print("\nScanning for HTML files...\n")
result = []
for folder, _dirnames, filenames in os.walk(PATH):
    # Match on the file names os.walk already returns instead of building a
    # glob pattern from `folder`: glob would interpret characters such as
    # "[", "]", "*" or "?" in a directory name as pattern syntax and
    # silently skip that folder.
    for name in filenames:
        if name.startswith('.'):
            # glob('*.html') never matched hidden files; keep skipping them.
            continue
        # normcase lower-cases on Windows only, mirroring glob's matching.
        if os.path.normcase(name).endswith('.html'):
            result.append(os.path.join(folder, name))
total_files = len(result)
print("Total HTML files found: {}\n".format(total_files))
print("Beginning srcset removal...\n")
#for filename in result:
# print(filename)
# file = codecs.open(filename,'r','utf-8')
# data = file.read()
# soup = BeautifulSoup(data, 'html.parser')
#
# for p in soup.find_all('img'):
# if 'srcset' in p.attrs:
# del p.attrs['srcset']
#
# file.close()
# file1 = codecs.open(filename,'w','utf-8')
# file1.write(soup.prettify())
# file1.close()
# Remove the srcset attribute from every <img> and <source> tag of each file
# in `result`, rewriting only the files that actually changed.
# Counters: `processed` = files visited, `modified` = files rewritten.
processed = 0
modified = 0
for filename in result:
    print("[PROCESSING] {}".format(filename))
    processed += 1

    # Read raw bytes and decode explicitly: mirrored pages are not always
    # UTF-8, and reading them as UTF-8 text failed with
    # "UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 ...".
    try:
        with open(filename, 'rb') as f:
            raw = f.read()
    except (IOError, OSError):
        print("[ERROR] Cannot read file")
        continue

    # Decode with fallback: strict UTF-8 first, then chardet's guess.
    # chardet has to be consulted *before* latin-1, because latin-1 maps
    # every possible byte and therefore never fails - placed first it made
    # the detection step unreachable.
    try:
        data = raw.decode('utf-8')
    except UnicodeDecodeError:
        # 'encoding' is present but None when chardet cannot decide, so a
        # .get() default is not enough; use "or" for the fallback.
        guess = chardet.detect(raw).get('encoding') or 'latin-1'
        try:
            data = raw.decode(guess, 'replace')
        except LookupError:
            # chardet named a codec this Python does not provide.
            data = raw.decode('latin-1')

    soup = BeautifulSoup(data, 'html.parser')

    changed = False
    for tag in soup.find_all(['img', 'source']):
        if 'srcset' in tag.attrs:
            del tag.attrs['srcset']
            changed = True

    if not changed:
        print(" [NO CHANGE]")
        continue

    # Write updated HTML.
    try:
        # Serialize before opening the file so that a failure here cannot
        # leave a truncated page behind. encode() is used instead of
        # prettify(), which inserts newlines/indentation and thereby changes
        # how inline content renders.
        html_bytes = soup.encode('utf-8')
        # Binary mode works on Python 2 and 3 alike and leaves existing
        # "\r\n" line endings untouched (text mode on Windows would turn
        # them into "\r\r\n").
        with open(filename, 'wb') as f:
            f.write(html_bytes)
    except Exception:
        # Best effort per file: report and move on to the next one, but let
        # Ctrl+C / sys.exit() through (a bare except would swallow them).
        print("[ERROR] Cannot write file")
        continue

    modified += 1
    print(" [UPDATED]")
# Final report: how many files were visited and how many were rewritten.
summary_lines = [
    "\n====================================",
    "Completed srcset cleanup",
    "Total files scanned : {}".format(processed),
    "Files modified : {}".format(modified),
    "====================================\n",
]
for summary_line in summary_lines:
    print(summary_line)
# Pause so terminal does not close immediately
print("\nPress any key to exit...")
try:
    # msvcrt exists only on Windows (any Python version).
    import msvcrt
except ImportError:
    # Elsewhere, wait for Enter. Python 2's input() would *evaluate* what is
    # typed, so use raw_input there; Python 3 only has input().
    try:
        _read_line = raw_input
    except NameError:
        _read_line = input
    try:
        _read_line()
    except (EOFError, KeyboardInterrupt, RuntimeError, IOError):
        # No usable stdin (closed, redirected, lost) or Ctrl+C: the pause is
        # purely a convenience, so just exit.
        pass
else:
    msvcrt.getch()
| |