Re: picture tag with source subtags - HTTrack Website Copier Forum

Subject: Re: picture tag with source subtags

Author: _pA89

Date: 11/03/2025 22:52

Modified the script and fixed the non-UTF-8 decoding errors.
HTTrack works fine for downloading aside from this bug (the settings need to
be set correctly) ... better than commercial programs like WebZip,
IDM-Grabber and Offline Explorer, and the free Cyotek WebCopy.
(Also, images on <https://www.editpadpro.com/> won't load -
<https://www.editpadpro.com//img/icon100.png> won't link.) (Is there any way
to delete/modify a reply?)

Here is the revised, working Python code (you can always ask ChatGPT to improve it)


#Subject: Re: picture tag with source subtags
#Author: Mitch B
#Date: 06/22/2017 23:41
# 	
#Here is a python script that I wrote that removes all the srcset tags that
#you can use after your website is downloaded.

#Given a PATH go through all the html files and delete all of the srcset
#subtags for img tags.
#files will be overwritten

from bs4 import BeautifulSoup
import os
from glob import glob
import codecs
import chardet
import sys

# Python 2/3 compatible tkinter import
try:
    import tkinter as tk
    from tkinter import messagebox
except ImportError:
    import Tkinter as tk
    import tkMessageBox as messagebox


#PATH = "<downloaded website path>"

# Keep this file in the project folder (not inside the website folder) and run it from there.
# >>> Auto detect HTTrack website folder (first folder next to this script)
#script_dir = os.path.dirname(os.path.abspath(__file__))
#subdirs = [d for d in os.listdir(script_dir) if os.path.isdir(os.path.join(script_dir, d))]
# Remove the line above if this script is put in the website folder

# Locate the folder holding this script.  __file__ is not defined in every
# way of launching the code (it raises NameError then), so sys.argv[0] is
# used as the stand-in path in that case.
try:
    _script_path = __file__
except NameError:
    _script_path = sys.argv[0]
script_dir = os.path.dirname(os.path.abspath(_script_path))

# >>> Auto detect HTTrack website folder (first folder next to this script)
# Remove this block entirely if script is placed *inside* website folder
#
# Candidates are the visible sub-folders of script_dir.  Two things keep the
# pick predictable:
#   * "hts-cache" is skipped -- HTTrack creates that folder inside every
#     project directory for its own cache, it is never the mirrored site;
#   * the names are sorted, because os.listdir() returns entries in
#     arbitrary order and "first folder" would otherwise vary between runs.
subdirs = sorted(
    d for d in os.listdir(script_dir)
    if os.path.isdir(os.path.join(script_dir, d))
    and not d.startswith('.')
    and d.lower() != 'hts-cache'
)

if not subdirs:
    print("No website folder found next to script.")
    sys.exit(1)

# Folder whose HTML files will be rewritten in place.
PATH = os.path.join(script_dir, subdirs[0])
# <<<


# ---- GUI confirmation box ------------------------------------------------
# Show the auto-detected folder and let the user abort before any file is
# overwritten.
root = tk.Tk()
root.withdraw()  # hide the empty main window; only the dialog is wanted

msg = "Script will modify HTML files in:\n\n{}\n\nProceed?".format(PATH)
answer = messagebox.askokcancel("Confirm Path", msg)
root.destroy()  # the hidden root window is not needed once the dialog closes

if not answer:
    print("Operation cancelled by user.")
    sys.exit(0)
# ---- end of GUI confirmation box -----------------------------------------



# Collect every HTML page below PATH.
#
# The file names come straight from os.walk() instead of glob(): glob treats
# "[", "]", "*" and "?" inside the *directory* part of the pattern as
# wildcards, so pages inside folders with such characters in their name were
# silently skipped.  The suffix test ignores case and also accepts ".htm".
print("\nScanning for HTML files...\n")
result = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(PATH)
    for name in names
    if name.lower().endswith(('.html', '.htm'))
]
total_files = len(result)

print("Total HTML files found: {}\n".format(total_files))
print("Beginning srcset removal...\n")



#for filename in result:
#    print(filename)
#    file = codecs.open(filename,'r','utf-8')
#    data = file.read()
#    soup = BeautifulSoup(data, 'html.parser')
#
#    for p in soup.find_all('img'):
#        if 'srcset' in p.attrs:
#            del p.attrs['srcset']
#
#    file.close()
#    file1 = codecs.open(filename,'w','utf-8')
#    file1.write(soup.prettify())
#    file1.close()



def _decode_html(raw):
    """Decode the raw bytes of one HTML file to text.

    UTF-8 is tried first.  If that fails, the encoding guessed by chardet is
    used (undecodable bytes are dropped).  Latin-1 is the last resort: it
    maps every byte value, so it cannot fail -- which is also why it must
    come *after* chardet, otherwise the detection step is never reached.
    """
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass

    # chardet reports {'encoding': None} when it has no guess, so the value
    # itself has to be checked; a .get() default does not cover that case.
    guessed = chardet.detect(raw).get('encoding')
    if guessed:
        try:
            return raw.decode(guessed, 'ignore')
        except LookupError:
            # chardet named a codec this Python installation does not have.
            pass

    return raw.decode('latin-1')


processed = 0  # files visited
modified = 0   # files actually rewritten

for filename in result:
    print("[PROCESSING] {}".format(filename))
    processed += 1

    # Read bytes rather than text: opening the files as UTF-8 text failed
    # with "UnicodeDecodeError: 'utf-8' codec can't decode byte 0x83 in
    # position 57: invalid start byte" on pages saved in other encodings.
    try:
        with open(filename, 'rb') as f:
            raw = f.read()
    except (IOError, OSError) as err:
        print("[ERROR] Cannot read file: {}".format(err))
        continue

    soup = BeautifulSoup(_decode_html(raw), 'html.parser')

    # Drop srcset from <img> and from the <source> children of <picture>.
    changed = False
    for tag in soup.find_all(['img', 'source']):
        if 'srcset' in tag.attrs:
            del tag.attrs['srcset']
            changed = True

    if not changed:
        print("    [NO CHANGE]")
        continue

    # Write updated HTML.
    # * The document is serialised BEFORE the file is opened for writing, so
    #   a failure while serialising cannot leave a truncated file behind.
    # * soup.encode() is used instead of prettify(): prettify() inserts
    #   newlines and indentation, which changes how whitespace-sensitive
    #   markup renders.
    # * Writing bytes in 'wb' mode works on Python 2 and 3 alike; the
    #   built-in open() only accepts encoding= on Python 3.
    try:
        output = soup.encode('utf-8')
        with open(filename, 'wb') as f:
            f.write(output)
    except Exception as err:  # per-file boundary: report and keep going
        print("[ERROR] Cannot write file: {}".format(err))
        continue

    modified += 1
    print("    [UPDATED]")

# Final report: one banner with the scanned / modified totals, emitted with a
# single print call (one line of output per list entry).
summary_lines = [
    "\n====================================",
    "Completed srcset cleanup",
    "Total files scanned : {}".format(processed),
    "Files modified      : {}".format(modified),
    "====================================\n",
]
print("\n".join(summary_lines))

# Pause so terminal does not close immediately
print("\nPress any key to exit...")

try:
    # Windows console: return on the first key press.
    import msvcrt
    msvcrt.getch()
except ImportError:
    # No msvcrt (non-Windows): wait for Enter instead.  sys.stdin.readline()
    # is used rather than input(), because on Python 2 input() evaluates
    # whatever the user types as a Python expression.
    try:
        sys.stdin.readline()
    except (AttributeError, EnvironmentError, ValueError, KeyboardInterrupt):
        # stdin missing, closed or unreadable, or the user pressed Ctrl-C:
        # there is nothing left to wait for, just exit.
        pass

Create subthread

All articles

Subject	Author	Date
picture tag with source subtags		03/24/2015 21:21
Re: picture tag with source subtags		06/01/2015 11:52
Re: picture tag with source subtags		01/05/2016 09:09
Re: picture tag with source subtags		06/22/2017 23:41
Re: picture tag with source subtags		09/20/2017 13:05
Re: picture tag with source subtags		06/27/2018 09:33
Re: picture tag with source subtags		03/23/2019 21:41
Re: picture tag with source subtags		11/03/2025 21:16
Re: picture tag with source subtags		11/03/2025 22:52