#!/usr/bin/env python3
"""
Metropoli BBS - PCBIndex to HTML Converter (Retro Edition)
==========================================================

This script generates static HTML index files for a directory tree,
mimicking the look and feel of a DOS-era BBS file listing.

Features:
- Recursive directory traversal (bottom-up).
- Parses legacy PCBoard file descriptions (_INDEX_ / 00_INDEX.TXT).
- Extracts and cleans descriptions from FILE_ID.DIZ in unpacked archives
  (reads from the .src folder, links to the web folder).
- Unified "Retro Text" pipeline:
  - Strips ANSI codes and PCB Color Codes (@Xnn) at the byte level.
  - Detects encodings: SF7 (Finnish 7-bit), CP437 (DOS), UTF-8, Latin1.
  - Transcodes everything to valid UTF-8 HTML.
- STRICT whitespace handling:
  - Removes trailing whitespace.
  - Removes vertical padding (empty lines at start/end).
  - PRESERVES leading whitespace (indentation) for ASCII art.
- Responsive retro styling:
  - Uses embedded "Perfect DOS" web font (locally hosted in /retrofonts/).
  - Dynamic width (110ch vs 132ch) based on content length.
  - Forces 110ch width for specific curated directories.
- Advanced breadcrumb navigation for unpacked trees (archive boundary detection).
"""

import os
import re
import html
import datetime
import urllib.parse
import fnmatch
import math
import stat

# =============================================================================
# CONFIGURATION
# =============================================================================

ROOT_DIR = "/home/ftp"

# Destination for clickable links (HTML versions generated by viewer)
UNPACKED_WEB_ROOT = "/home/ftp/unpacked"
# Source for reading metadata (Raw files)
UNPACKED_SRC_ROOT = "/home/ftp/unpacked.src"

# Files and directories to exclude from the generated index.
# CASE INSENSITIVE matching.
EXCLUDED_PATTERNS = {
    ".*",               # Hidden files
    "_INDEX*",          # PCBoard Index source files
    "INDEX.*",
    "00_INDEX.TXT",
    "CORE",             # Linux core dumps
    "*.desc",           # Internal description files
    "mpolilogo*.*",     # Website assets
    "starportlogo*",    # Website assets (Starport variant)
    "index-style",
    "favicon.ico",
    "files",
    "sitemap*.txt",
    "robots.txt",
    "pub",
    "DESCRIPT.ION",     # 4DOS descriptions
    "_index*",
    "index.html",       # The file we are generating
    "index.htm",
    "retrofonts",       # Exclude font directory
    "unpacked.src"      # Exclude raw source of unpacked files from indexing
}

# Mapping file extensions to their BBS-era archiver commands.
# Spaces here are preserved in the HTML via 'white-space: pre'.
ARCHIVER_MAP = {
    '.ZIP': '[ PKUNZIP ]',
    '.ARJ': '[ ARJ x ]',
    '.LZH': '[ LHARC e ]',
    '.LHA': '[ LHARC e ]',
    '.RAR': '[ UNRAR ]',
    '.ZOO': '[ ZOO x ]',
    '.ARC': '[ ARC e ]',
    '.PAK': '[ PAK e ]',
    '.EXE': '[ UNPACK ]'
}

# 1990-01-01 00:00:00 Timestamp.
# Used as a fallback for missing dates or to clamp "future" dates.
FIXED_TIMESTAMP = 631152000.0

# Cutoff Year for Date Sanity Check.
# Files newer than this (e.g., generated logs) get clamped to 1990 styling.
MAX_VALID_YEAR = 2015

# =============================================================================
# RETRO TEXT PROCESSING LOGIC
# =============================================================================

# Regex to identify ANSI escape sequences (colors, cursor moves).
ANSI_BYTES_REGEX = re.compile(rb'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

# Regex to identify PCBoard Color Codes (e.g., @X0A, @X1F).
# Matches @X followed by two hex digits. Case insensitive.
PCB_BYTES_REGEX = re.compile(rb'@X[0-9A-F]{2}', re.IGNORECASE)
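
# Illustrative examples (assumed inputs, not from the original data):
# ANSI_BYTES_REGEX matches sequences such as b'\x1b[1;33m' or b'\x1b[2J',
# and PCB_BYTES_REGEX matches PCBoard color tokens such as b'@X0F' or b'@x1f'.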

# Standard Word Regex (2+ letters) used for text density analysis.
WORD_REGEX = re.compile(rb'\b[a-zA-Z]{2,}\b')


def is_binary_safe(content):
    """
    Determines if a byte string is likely binary garbage rather than text.
    Checks for high density of control codes (excluding tabs/newlines).
    """
    if not content:
        return False
    # Check for bytes < 9 (excluding Null sometimes used in text padding)
    # and 14-26 (Shift Out, DLE, etc).
    bad = sum(1 for b in content if (b < 9 and b != 0) or (14 <= b <= 26))
    return bad > (len(content) * 0.10)


def detect_sf7(raw_bytes):
    """
    Detects Finnish SF7 (ISO 646-FI) encoding.
    SF7 uses 7-bit chars (e.g. '{') to represent Scandis.
    If high-bit chars (>127) exist, it is strictly NOT SF7.
    """
    total_len = len(raw_bytes)
    if total_len == 0:
        return False

    # SAFETY 1: SF7 is a 7-bit encoding. If we see 8-bit bytes (CP437 blocks,
    # Latin1 chars), it cannot be SF7. This protects CP437 art and Latin1 text.
    if any(b > 127 for b in raw_bytes):
        return False

    # SAFETY 2: Word Density.
    # We relax this for short descriptions (under 100 bytes) because
    # "Very good!" has high density, but "[]" has low.
    if total_len > 100:
        total_words_list = WORD_REGEX.findall(raw_bytes)
        total_words = len(total_words_list)
        if (total_words / total_len) < 0.05:
            return False

    # Heuristic: Ratio of '{' (ä) to normal letters.
    sf7_marker_count = raw_bytes.count(b'{')
    letter_count = sum(1 for b in raw_bytes if (65 <= b <= 90) or (97 <= b <= 122))
    if letter_count == 0:
        return False

    # Threshold: '{' must represent > 2.5% of all letters.
    return (sf7_marker_count / letter_count) > 0.025


def detect_encoding(raw_bytes, source_filename=""):
    """
    Determines the likely encoding of the byte stream.
    Prioritizes: SF7 -> ASCII -> UTF-8 -> CP437 -> Latin1.
    """
    # 0. Special Case: 00_INDEX.TXT is known to be CP437 (or ASCII). Never SF7.
    is_00_index = "00_INDEX" in source_filename.upper()

    # 1. Check for SF7.
    # Skip if source is 00_INDEX or if detection fails.
    if not is_00_index and detect_sf7(raw_bytes):
        return 'SF7'

    # 2. Check for Pure ASCII (7-bit clean).
    if not any(b > 127 for b in raw_bytes):
        return 'ASCII'

    # 3. Check for valid UTF-8.
    try:
        raw_bytes.decode('utf-8')
        return 'UTF-8'
    except UnicodeDecodeError:
        pass

    # 4. Fallback: Distinguish CP437 (DOS Art) from Latin1 (Windows/Unix).
    cp437_score = 0
    latin1_score = 0

    # If it's 00_INDEX.TXT, we bias heavily towards CP437.
    if is_00_index:
        cp437_score += 100

    for b in raw_bytes:
        if 0x80 <= b <= 0x9F:
            cp437_score += 5     # Accented letters in CP437, C1 controls in Latin1
        elif 0xB0 <= b <= 0xDF:
            cp437_score += 1     # CP437 shading / box-drawing blocks
        elif b in [0xC4, 0xD6, 0xC5, 0xE4, 0xF6, 0xE5]:
            latin1_score += 3    # Scandi Latin1
        elif b == 0xA2:
            cp437_score += 2     # Cent symbol

    return 'CP437' if cp437_score >= latin1_score else 'LATIN1'


def transcode_text(raw_bytes, encoding):
    """
    Converts the raw bytes from the detected source encoding
    into a clean UTF-8 string.
    """
    enc = encoding.lower()
    if enc == 'sf7':
        text = raw_bytes.decode('ascii', errors='replace')
        # Map SF7 chars to Unicode
        return text.translate(str.maketrans({
            '[': 'Ä', '\\': 'Ö', ']': 'Å',
            '{': 'ä', '|': 'ö', '}': 'å',
            '~': 'ü'
        }))
    elif enc == 'cp437':
        return raw_bytes.decode('cp437', errors='replace')
    elif enc == 'utf-8':
        return raw_bytes.decode('utf-8', errors='replace')
    elif enc == 'ascii':
        return raw_bytes.decode('ascii', errors='replace')
    else:
        return raw_bytes.decode('latin1', errors='replace')


def process_retro_text(raw_bytes, source_filename=""):
    """
    Master function to clean and convert description text.
    1. Strips ANSI & PCB codes (bytes) and DOS EOF.
    2. Detects encoding & transcodes (UTF-8).
    3. Trims whitespace (safe for ASCII art).
    4. Escapes HTML.
    """
    if not raw_bytes:
        return ""

    # Strip ANSI escape sequences
    clean_bytes = ANSI_BYTES_REGEX.sub(b'', raw_bytes)
    # Strip PCBoard Color Codes (@Xnn)
    clean_bytes = PCB_BYTES_REGEX.sub(b'', clean_bytes)
    # Strip DOS EOF Char (0x1A / 26) - Common in old text files
    clean_bytes = clean_bytes.replace(b'\x1a', b'')

    # Safety check
    if is_binary_safe(clean_bytes):
        return "[Binary Data]"

    encoding = detect_encoding(clean_bytes, source_filename=source_filename)
    utf8_text = transcode_text(clean_bytes, encoding)

    # Whitespace Cleanup:
    # 1. Right-strip EVERY line (remove trailing whitespace).
    #    We do NOT lstrip() here to preserve indentation/centering.
    lines = [line.rstrip() for line in utf8_text.splitlines()]

    # 2. Vertical Trim (remove empty lines from start)
    while lines and not lines[0]:
        lines.pop(0)

    # 3. Vertical Trim (remove empty lines from end)
    while lines and not lines[-1]:
        lines.pop()

    # 4. Rejoin with clean newlines (strip vertical padding of result)
    final_text = "\n".join(lines)

    return html.escape(final_text)
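
# Illustrative example (assumed input, not from the original data):
# process_retro_text(b'@X0EP{iv{{!\x1b[0m  ') strips the PCBoard color code and
# the ANSI reset, detects SF7, transliterates the braces and returns the
# HTML-escaped string 'Päivää!'.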

# =============================================================================
# HTML TEMPLATES
# =============================================================================

HTML_HEADER = """ {header_title} - {raw_path}
{header_title}
{breadcrumb_html}
{context_link_html}
"""

HTML_FOOTER = """FILENAME SIZE DATE DESCRIPTION
 TOTAL: {file_count}
"""
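
# The two template strings above are only the text skeleton. Judging from the
# .format() call in generate_folder_index() and from backup_existing_indices(),
# the full HTML_HEADER also references {frame_style}, {header_logo} and
# {header_alt}, and the generated page must carry the generator marker that the
# backup check looks for. A minimal illustrative (assumed, not original) skeleton:
#
#     <meta name="generator" content="Metropoli-PCB-Indexer">
#     <title>{header_title} - {raw_path}</title>
#     <div class="frame" style="{frame_style}"> ... <table> ...
#
# while HTML_FOOTER closes the listing table and prints the TOTAL line.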
""" # ============================================================================= # HELPER FUNCTIONS # ============================================================================= def safe_str(s): """ Encodes string to UTF-8 properly for HTML injection. Handles 'surrogateescape' artifacts from os.listdir (e.g., lone surrogates like \udcaf) by reverting them to bytes and decoding with replacement characters. """ if s is None: return "" try: # Step 1: Revert 'surrogateescape' decoding (common in os.listdir on Linux) # to get original raw bytes. raw_bytes = s.encode('utf-8', 'surrogateescape') except UnicodeError: # If surrogates are somehow complex or mixed, force replacement during encode raw_bytes = s.encode('utf-8', 'replace') # Step 2: Decode as UTF-8 with replacement for errors (invalid UTF-8 bytes) # This guarantees the result is a valid UTF-8 string without surrogates. return raw_bytes.decode('utf-8', 'replace') def force_clean(s): """ Last-resort sanitizer used before file writing. Ensures absolute UTF-8 validity by stripping any lingering surrogates. """ if not s: return "" return s.encode('utf-8', 'replace').decode('utf-8') def safe_href(s): """URL-encodes a path component.""" if s is None: return "" return urllib.parse.quote(os.fsencode(s)) def is_excluded(filename): """Checks if a filename matches any of the exclusion patterns.""" fname = filename.upper() for pattern in EXCLUDED_PATTERNS: if fnmatch.fnmatch(fname, pattern.upper()): return True return False def is_valid_file_object(path): """Checks if path is a regular file or directory (and exists).""" try: st = os.lstat(path) if stat.S_ISLNK(st.st_mode): return False if stat.S_ISREG(st.st_mode) or stat.S_ISDIR(st.st_mode): return True return False except OSError: return False def is_effectively_empty(path): """ Recursively checks if a directory is empty or contains only excluded files. """ for root, dirs, files in os.walk(path): dirs[:] = [d for d in dirs if is_valid_file_object(os.path.join(root, d))] for f in files: fp = os.path.join(root, f) if not is_valid_file_object(fp): continue if not is_excluded(f): return False return True def read_file_bytes_safe(path): """Attempts to read a file as raw bytes. Returns None on failure.""" try: with open(path, 'rb') as f: return f.read() except Exception: return None def format_size(size_bytes): """Formats bytes into human readable string strictly with 2 decimals.""" if size_bytes == 0: return "0.00 B" size_name = ("B", "KB", "MB", "GB", "TB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) s = size_bytes / p return "%.2f %s" % (s, size_name[i]) def get_folder_size(start_path): """Calculates total size of a directory (excluding excluded files).""" total_size = 0 for dirpath, dirnames, filenames in os.walk(start_path): for f in filenames: if not is_excluded(f): fp = os.path.join(dirpath, f) if is_valid_file_object(fp): try: total_size += os.path.getsize(fp) except OSError: pass return total_size def backup_existing_indices(full_path): """ Backs up existing index.html files if they weren't generated by this tool. 
""" targets = ["index.html", "index.htm", "INDEX.HTML", "INDEX.HTM"] our_markers = [b'content="Metropoli-PCB-Indexer"'] for target in targets: target_path = os.path.join(full_path, target) if is_valid_file_object(target_path): is_ours = False try: with open(target_path, 'rb') as f: head = f.read(4096) for marker in our_markers: if marker in head: is_ours = True break except: pass if not is_ours: new_name = "_" + target new_path = os.path.join(full_path, new_name) try: os.replace(target_path, new_path) except OSError: pass def get_better_desc(rel_dir_path, filename): """ Looks for a FILE_ID.DIZ inside the corresponding unpacked directory. Uses process_retro_text to clean it up. CRITICAL: Reads metadata from UNPACKED_SRC_ROOT (raw text). """ lower_rel = rel_dir_path.lower() lower_filename = filename.lower() # Check src path for reading src_path = os.path.join(UNPACKED_SRC_ROOT, lower_rel, lower_filename) # We check if the folder exists in source to try and read the DIZ if not os.path.isdir(src_path) or os.path.islink(src_path): return None candidates = ["file_id.old", "FILE_ID.OLD", "file_id.diz", "FILE_ID.DIZ"] for candidate in candidates: candidate_path = os.path.join(src_path, candidate) if is_valid_file_object(candidate_path): raw_content = read_file_bytes_safe(candidate_path) if raw_content: # DIZ files are usually CP437 or SF7, treated generally desc_html = process_retro_text(raw_content, source_filename=candidate) # DO NOT STRIP() HERE! strict cleanup is already done in process_retro_text return desc_html return None def generate_breadcrumb(rel_path): """ Generates the navigation breadcrumb HTML in 'C:\DIR' style. Handles 'unpacked' directories specially to redirect intermediate paths back to the compressed file index and HIDE the 'UNPACKED' level. """ if not rel_path or rel_path == ".": return 'C:' parts = rel_path.split(os.sep) html_parts = [] # Root Link up_levels = len(parts) root_link = "../" * up_levels html_parts.append(f'C:') # Detect if we are inside the 'unpacked' tree is_unpacked = (parts[0].lower() == 'unpacked') # Flag to track if we have crossed the boundary from "Folder Structure" to "Inside Zip" inside_archive_boundary = False for i, part in enumerate(parts): # Handle "UNPACKED" word - Skip it entirely from display if is_unpacked and i == 0: continue levels_up = len(parts) - 1 - i # Link construction logic for unpacked tree if is_unpacked: # Reconstruct the path from ROOT to the current part (skipping 'unpacked') source_segments = parts[1:i+1] # Resolve this path in ROOT_DIR case-insensitively to determine # if it points to a real Source Directory or Source File (Archive). # This handles case mismatches (e.g. 'games' vs 'GAMES') which # prevents 'os.path.isdir' from working on Linux. check_path = ROOT_DIR resolved_segments = [] valid_source = True for seg in source_segments: found = False if os.path.isdir(check_path): try: for entry in os.listdir(check_path): if entry.lower() == seg.lower(): check_path = os.path.join(check_path, entry) resolved_segments.append(entry) found = True break except OSError: pass if not found: valid_source = False break is_source_file = False is_source_dir = False if valid_source: if os.path.isfile(check_path): is_source_file = True elif os.path.isdir(check_path): is_source_dir = True # Determine target URL based on whether we are looking at a Directory or a File (Archive) if not inside_archive_boundary: if is_source_file: # It IS the archive file (e.g. doom.zip). # This is the boundary. Link to the Unpacked Index (self). 

def generate_breadcrumb(rel_path):
    """
    Generates the navigation breadcrumb HTML in 'C:\\DIR' style.
    Handles 'unpacked' directories specially to redirect intermediate paths
    back to the compressed file index and HIDE the 'UNPACKED' level.
    """
    if not rel_path or rel_path == ".":
        return 'C:'

    parts = rel_path.split(os.sep)
    html_parts = []

    # Root Link
    up_levels = len(parts)
    root_link = "../" * up_levels
    html_parts.append(f'<a href="{root_link}index.html">C:</a>')

    # Detect if we are inside the 'unpacked' tree
    is_unpacked = (parts[0].lower() == 'unpacked')

    # Flag to track if we have crossed the boundary from "Folder Structure" to "Inside Zip"
    inside_archive_boundary = False

    for i, part in enumerate(parts):
        # Handle "UNPACKED" word - Skip it entirely from display
        if is_unpacked and i == 0:
            continue

        levels_up = len(parts) - 1 - i

        # Link construction logic for unpacked tree
        if is_unpacked:
            # Reconstruct the path from ROOT to the current part (skipping 'unpacked')
            source_segments = parts[1:i + 1]

            # Resolve this path in ROOT_DIR case-insensitively to determine
            # if it points to a real Source Directory or Source File (Archive).
            # This handles case mismatches (e.g. 'games' vs 'GAMES') which
            # prevents 'os.path.isdir' from working on Linux.
            check_path = ROOT_DIR
            resolved_segments = []
            valid_source = True
            for seg in source_segments:
                found = False
                if os.path.isdir(check_path):
                    try:
                        for entry in os.listdir(check_path):
                            if entry.lower() == seg.lower():
                                check_path = os.path.join(check_path, entry)
                                resolved_segments.append(entry)
                                found = True
                                break
                    except OSError:
                        pass
                if not found:
                    valid_source = False
                    break

            is_source_file = False
            is_source_dir = False
            if valid_source:
                if os.path.isfile(check_path):
                    is_source_file = True
                elif os.path.isdir(check_path):
                    is_source_dir = True

            # Determine target URL based on whether we are looking at a Directory or a File (Archive)
            if not inside_archive_boundary:
                if is_source_file:
                    # It IS the archive file (e.g. doom.zip).
                    # This is the boundary. Link to the Unpacked Index (self).
                    inside_archive_boundary = True
                    # Use source_segments (current unpacked path context) for unpacked links
                    target_url = ("../" * len(parts)) + "unpacked/" + "/".join(safe_href(p) for p in source_segments) + "/index.html"
                elif is_source_dir:
                    # It is a directory in the source tree (e.g. GAMES).
                    # Link to the Compressed Index (Source).
                    # Use resolved_segments to ensure we link to the correct case on disk (e.g. GAMES not games)
                    target_url = ("../" * len(parts)) + "/".join(safe_href(p) for p in resolved_segments) + "/index.html"
                else:
                    # Fallback (Inside archive or path mismatch)
                    target_url = ("../" * len(parts)) + "unpacked/" + "/".join(safe_href(p) for p in source_segments) + "/index.html"
            else:
                # We are ALREADY inside the archive (e.g. DATA).
                # Link to the Unpacked Index (subdir).
                target_url = ("../" * len(parts)) + "unpacked/" + "/".join(safe_href(p) for p in source_segments) + "/index.html"

            html_parts.append(f'<a href="{target_url}">{safe_str(part.upper())}</a>')
        else:
            # Standard logic for normal tree
            link_target = "../" * levels_up + "index.html"
            if levels_up == 0:
                link_target = "index.html"
            html_parts.append(f'<a href="{link_target}">{safe_str(part.upper())}</a>')

    # Join with backslash, no spaces
    return '\\'.join(html_parts)


def parse_date(date_str):
    """Parses MM-DD-YY date string into standardized YYYY-MM-DD."""
    try:
        m, d, y = date_str.split('-')
        y_int = int(y)
        full_year = 1900 + y_int if y_int > 50 else 2000 + y_int
        dt = datetime.datetime(full_year, int(m), int(d))
        return dt.strftime("%Y-%m-%d"), int(dt.timestamp())
    except Exception:
        return date_str, 0
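
# Illustrative: parse_date("12-17-93") -> ("1993-12-17", <epoch>), while
# parse_date("01-05-03") pivots to 2003 because the two-digit year is <= 50.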

def parse_htaccess(full_path):
    """
    Parses .htaccess for AddDescription fields.
    """
    htaccess_path = os.path.join(full_path, ".htaccess")
    desc_map = {}

    if is_valid_file_object(htaccess_path):
        content_bytes = read_file_bytes_safe(htaccess_path)
        if content_bytes:
            # .htaccess is usually simple text
            try:
                content = content_bytes.decode('utf-8')
            except UnicodeDecodeError:
                content = content_bytes.decode('latin-1', errors='replace')

            lines = content.splitlines()
            regex = re.compile(r'^\s*AddDescription\s+"([^"]+)"\s+"(?:\*/)?([^"]+)"', re.IGNORECASE)
            for line in lines:
                m = regex.search(line)
                if m:
                    desc_map[m.group(2).upper()] = m.group(1)
    return desc_map


def parse_pcboard_index(index_path):
    """
    Parses a legacy PCBoard _INDEX_ file.
    """
    entries = []
    current_entry = None

    if not is_valid_file_object(index_path):
        return []
    content_bytes = read_file_bytes_safe(index_path)
    if not content_bytes:
        return []

    # Get filename for detection logic (00_INDEX check)
    index_filename = os.path.basename(index_path)

    lines = content_bytes.splitlines()
    line_regex = re.compile(rb'^([A-Z0-9_\-\.]{1,12})\s+(\d+)\s+(\d{2}-\d{2}-\d{2})\s+(.*)')

    for idx, line in enumerate(lines):
        line = line.rstrip()
        if not line:
            continue

        match = line_regex.match(line)
        if match:
            if current_entry:
                entries.append(current_entry)

            f_name = match.group(1).decode('ascii', errors='ignore')
            f_size = match.group(2).decode('ascii', errors='ignore')
            f_date = match.group(3).decode('ascii', errors='ignore')
            raw_desc = match.group(4)

            current_entry = {
                'filename': f_name,
                'size': f_size,
                'raw_date': f_date,
                'raw_desc_lines': [raw_desc],
                'original_order': idx
            }
        elif current_entry:
            # Handle Continuation Lines.
            # We must detect if it's a pipe-separated continuation (standard)
            # or just a wrapped line.

            # 1. Check if the line *visually* starts with a pipe
            if line.lstrip().startswith(b'|'):
                # Locate the pipe char
                pidx = line.find(b'|')
                # Content is everything after the pipe.
                # We RSTRIP to remove trailing space, but we do NOT LSTRIP the content
                # to preserve any intended indentation (ASCII art).
                content = line[pidx + 1:].rstrip()
                current_entry['raw_desc_lines'].append(content)
            else:
                # Fallback: No pipe found. We have to assume it's just text.
                # In this case we strip to be safe, as indentation without a
                # delimiter is unreliable.
                current_entry['raw_desc_lines'].append(line.strip())

    if current_entry:
        entries.append(current_entry)

    # Process descriptions
    for e in entries:
        full_desc_bytes = b'\n'.join(e['raw_desc_lines'])
        # Pass the index filename so process_retro_text knows if it's 00_INDEX.TXT
        e['desc'] = process_retro_text(full_desc_bytes, source_filename=index_filename)

    return entries
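
# Illustrative _INDEX_ entry format matched by line_regex (hypothetical content):
#
#   DOOM19S.ZIP   2038133  12-17-93  DOOM v1.9 shareware episode
#                                  | second description line after the pipe
#
# The first line yields filename/size/date/description; pipe-prefixed lines are
# appended as continuation lines with the text after the pipe kept un-lstripped.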

def get_unpack_url(rel_dir_path, filename):
    """
    Checks if an archive has a corresponding unpacked folder.
    Links to the WEB root (HTML version).
    """
    lower_rel = rel_dir_path.lower()
    lower_filename = filename.lower()

    # Check Web path for linking
    web_disk_path = os.path.join(UNPACKED_WEB_ROOT, lower_rel, lower_filename)

    # We check if the folder exists (even if we ignore it for indexing)
    if os.path.isdir(web_disk_path) and not os.path.islink(web_disk_path):
        _, ext = os.path.splitext(filename.upper())
        label = ARCHIVER_MAP.get(ext, '[ UNPACK ]')
        url_path = f"/unpacked/{safe_href(lower_rel)}/{safe_href(lower_filename)}/"
        return url_path, label
    return None, None


def get_context_link_html(full_path):
    """
    Links from unpacked view back to original file.
    Adjusted to handle checking within WEB root.
    """
    abs_unpacked = os.path.abspath(UNPACKED_WEB_ROOT)
    abs_current = os.path.abspath(full_path)

    if abs_current.startswith(abs_unpacked) and abs_current != abs_unpacked:
        rel_from_unpacked = os.path.relpath(abs_current, abs_unpacked)
        path_parts = rel_from_unpacked.split(os.sep)

        current_phys_path = ROOT_DIR
        matched_url_parts = []

        for part in path_parts:
            found_name = None
            try:
                if os.path.isdir(current_phys_path):
                    for item in os.listdir(current_phys_path):
                        if item.lower() == part.lower():
                            found_name = item
                            break
            except OSError:
                break

            if found_name:
                next_phys_path = os.path.join(current_phys_path, found_name)
                if os.path.isfile(next_phys_path):
                    if not matched_url_parts:
                        href = "/"
                    else:
                        href = "/" + "/".join(safe_href(p) for p in matched_url_parts) + "/"
                    return f'<a href="{href}">{href}</a>'
                current_phys_path = next_phys_path
                matched_url_parts.append(found_name)
            else:
                break
    return ""


def update_folder_timestamps(full_path, index_file_path):
    """
    Updates the folder timestamp to match the newest file inside it.
    Clamps dates > MAX_VALID_YEAR.
    CRITICAL: This ignores 'index.html', 'index.htm', and '_INDEX_' files to
    ensure the directory timestamp reflects content, not the generated index itself.
    """
    try:
        # 1. Reset timestamps of excluded items to fixed date.
        #    EXCEPTION: We do NOT reset index.html here, allowing it to stay "Current".
        try:
            immediate_items = os.listdir(full_path)
            for item in immediate_items:
                # Skip touching the index files we just generated so they stay fresh
                if item.lower() in ["index.html", "index.htm", "_index_"]:
                    continue
                if is_excluded(item):
                    item_path = os.path.join(full_path, item)
                    if is_valid_file_object(item_path):
                        try:
                            os.utime(item_path, (FIXED_TIMESTAMP, FIXED_TIMESTAMP))
                        except OSError:
                            pass
        except OSError:
            pass

        # 2. Find newest content file
        newest_ts = 0.0
        for root, dirs, files in os.walk(full_path):
            dirs[:] = [d for d in dirs if not is_excluded(d) and is_valid_file_object(os.path.join(root, d))]
            for f in files:
                # Explicitly skip index files from calculation
                if f.lower() in ["index.html", "index.htm", "_index_"]:
                    continue
                if is_excluded(f):
                    continue
                file_path = os.path.join(root, f)
                if file_path == index_file_path:
                    continue
                if is_valid_file_object(file_path):
                    try:
                        stat_res = os.stat(file_path)
                        # Sanity Check
                        file_year = datetime.datetime.fromtimestamp(stat_res.st_mtime).year
                        if file_year > MAX_VALID_YEAR:
                            # If file is too new (e.g. logs), ignore it for sorting
                            # or clamp it. Here we clamp it to keep folder retro.
                            current_ts = FIXED_TIMESTAMP
                        else:
                            current_ts = stat_res.st_mtime
                        if current_ts > newest_ts:
                            newest_ts = current_ts
                    except OSError:
                        pass

            for d in dirs:
                dir_path = os.path.join(root, d)
                if is_valid_file_object(dir_path):
                    try:
                        stat_res = os.stat(dir_path)
                        if stat_res.st_mtime > newest_ts:
                            newest_ts = stat_res.st_mtime
                    except OSError:
                        pass

        if newest_ts > 0:
            os.utime(full_path, (newest_ts, newest_ts))
        else:
            os.utime(full_path, (FIXED_TIMESTAMP, FIXED_TIMESTAMP))

    except OSError as e:
        print(f"Timestamp update failed for {full_path}: {e}")
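
# Illustrative: a content file stamped 2023 (> MAX_VALID_YEAR) is treated as
# FIXED_TIMESTAMP (1990-01-01), so a folder whose files are all "too new"
# keeps the retro 1990 date instead of advertising the modern mtime.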
"games/doom" rel_from_web = os.path.relpath(full_path, UNPACKED_WEB_ROOT) # Construct potential source path: /home/ftp/unpacked.src/games/doom/file.txt possible_src = os.path.join(UNPACKED_SRC_ROOT, rel_from_web, item) if is_valid_file_object(possible_src): target_stat_path = possible_src stat_res = os.stat(target_stat_path) dt = datetime.datetime.fromtimestamp(stat_res.st_mtime) if dt.year > MAX_VALID_YEAR or int(dt.timestamp()) == int(FIXED_TIMESTAMP): date_str = "" else: date_str = dt.strftime("%Y-%m-%d") raw_desc = htaccess_desc.get(upper_item, "") desc_html = html.escape(raw_desc) file_list.append({ 'filename': item, 'real_name': item, 'size': str(stat_res.st_size), 'raw_date': None, 'formatted_date': date_str, 'timestamp': int(dt.timestamp()), 'desc': desc_html, 'original_order': 999999, 'is_dir': False }) # 2. Merge Lists final_list = [] dir_list.sort(key=lambda x: x['filename'].upper()) final_list.extend(dir_list) for entry in parsed_entries: fname_upper = entry['filename'].upper() if fname_upper in disk_map_files: entry['real_name'] = disk_map_files[fname_upper] entry['is_dir'] = False final_list.append(entry) file_list.sort(key=lambda x: x['filename'].upper()) final_list.extend(file_list) rel_path = os.path.relpath(full_path, ROOT_DIR) if rel_path == ".": rel_path = "" html_rows = [] # 3. Width Logic (110ch vs 132ch) # Default is 110ch (Fits legacy forum files) # CHECK FOR FORCED 110ch MODE based on curated directories forced_mode_keywords = {'software', 'hardware', 'skene', 'tlr'} force_110 = False # rel_path is already calculated and sanitized (empty string for root) if rel_path: first_part = rel_path.split(os.sep)[0].lower() if first_part in forced_mode_keywords: force_110 = True needs_wide = False # Only run dynamic check if NOT forced if not force_110: for item in final_list: real_name = item.get('real_name', item['filename']) # Check 1: Long Filename trigger (increased to 35) if len(real_name) > 35: needs_wide = True break # Check 2: Long Description trigger (increased to 50) if item['desc']: # Unescape so entities like " count as 1 char # Also strip whitespace from check since we will strip it in display clean_d = html.unescape(re.sub(r'<[^>]+>', '', item['desc'])) longest_line = max(len(line.rstrip()) for line in clean_d.splitlines()) if clean_d else 0 if longest_line > 50: needs_wide = True break frame_style = "width: 132ch;" if needs_wide else "width: 110ch;" # 4. Generate HTML Rows for item in final_list: real_name = item.get('real_name', item['filename']) if 'formatted_date' in item: display_date = item['formatted_date'] timestamp = item['timestamp'] else: display_date, timestamp = parse_date(item['raw_date']) if timestamp == int(FIXED_TIMESTAMP): display_date = "" if item.get('is_dir', False): desc_html = html.escape(item['desc']) else: better_desc = get_better_desc(rel_path, real_name) if better_desc: desc_html = better_desc else: desc_html = item['desc'] display_filename = safe_str(item['filename']) href_link = safe_href(real_name) + ("/" if item.get('is_dir', False) else "") if item.get('is_dir', False): unpack_html = '[ DIR ]' else: unpack_url, unpack_label = get_unpack_url(rel_path, real_name) if unpack_url: unpack_html = f'{unpack_label}' else: unpack_html = "" data_bytes = item['size'] if item.get('is_dir', False): sort_bytes = 0 else: sort_bytes = data_bytes row = f""" {display_filename} {unpack_html} {item['size']} {display_date} {desc_html} """ html_rows.append(row) # 5. 

    # 4. Generate HTML Rows
    for item in final_list:
        real_name = item.get('real_name', item['filename'])

        if 'formatted_date' in item:
            display_date = item['formatted_date']
            timestamp = item['timestamp']
        else:
            display_date, timestamp = parse_date(item['raw_date'])
            if timestamp == int(FIXED_TIMESTAMP):
                display_date = ""

        if item.get('is_dir', False):
            desc_html = html.escape(item['desc'])
        else:
            better_desc = get_better_desc(rel_path, real_name)
            if better_desc:
                desc_html = better_desc
            else:
                desc_html = item['desc']

        display_filename = safe_str(item['filename'])
        href_link = safe_href(real_name) + ("/" if item.get('is_dir', False) else "")

        if item.get('is_dir', False):
            unpack_html = '[ DIR ]'
        else:
            unpack_url, unpack_label = get_unpack_url(rel_path, real_name)
            if unpack_url:
                unpack_html = f'<a href="{unpack_url}">{unpack_label}</a>'
            else:
                unpack_html = ""

        data_bytes = item['size']
        if item.get('is_dir', False):
            sort_bytes = 0
        else:
            sort_bytes = data_bytes

        row = f"""<tr>
<td><a href="{href_link}">{display_filename}</a></td>
<td>{unpack_html}</td>
<td data-bytes="{sort_bytes}">{item['size']}</td>
<td data-ts="{timestamp}">{display_date}</td>
<td>{desc_html}</td>
</tr>
"""
        html_rows.append(row)

    # 5. Write Output
    out_path = os.path.join(full_path, "index.html")

    # Generate C:\ style path for window title
    raw_path_display = "C:\\" if (not rel_path or rel_path == ".") else "C:\\" + safe_str(rel_path.replace("/", "\\").upper())

    # BRANDING LOGIC
    # Check if we are in the 'skene' tree (either standard or unpacked).
    # rel_path is relative to ROOT_DIR (/home/ftp),
    # e.g. 'skene', 'skene/demo', 'unpacked/skene', 'unpacked/skene/demo'
    check_path = rel_path.lower().replace('\\', '/')
    is_starport = False
    if check_path == 'skene' or check_path.startswith('skene/'):
        is_starport = True
    elif check_path == 'unpacked/skene' or check_path.startswith('unpacked/skene/'):
        is_starport = True

    if is_starport:
        header_title = "Starport BBS"
        header_logo = "/starportlogo-500x242.png"
        header_alt = "Starport BBS"
    else:
        header_title = "Metropoli BBS"
        header_logo = "/mpolilogo2-transparent-500x500.png"
        header_alt = "Metropoli BBS"

    with open(out_path, 'w', encoding='utf-8') as f:
        bc_html = generate_breadcrumb(rel_path)

        # Prepare the header content
        header_content = HTML_HEADER.format(
            raw_path=raw_path_display,
            breadcrumb_html=bc_html,
            context_link_html=context_link_html,
            frame_style=frame_style,
            header_title=header_title,
            header_logo=header_logo,
            header_alt=header_alt
        )

        # FINAL SANITIZATION: force_clean ensures no surrogates exist before writing
        f.write(force_clean(header_content))
        f.write("".join(html_rows))
        f.write(HTML_FOOTER.format(file_count=len(final_list)))

    update_folder_timestamps(full_path, out_path)


if __name__ == "__main__":
    for subdir, dirs, files in os.walk(ROOT_DIR, topdown=False):
        # Exclude unpacked.src completely from traversal
        if "unpacked.src" in subdir:
            continue
        # With topdown=False os.walk has already visited the children, so pruning
        # dirs[:] cannot stop traversal; skip excluded directories explicitly.
        dirs[:] = [d for d in dirs if not is_excluded(d)]
        if subdir != ROOT_DIR and is_excluded(os.path.basename(subdir)):
            continue
        generate_folder_index(subdir)
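
# Illustrative usage note (assumed, not part of the original tool): the tree is
# walked bottom-up from ROOT_DIR, so running the script with no arguments
# regenerates every index. For spot-checking a single directory one can call,
# for example:
#
#     generate_folder_index("/home/ftp/GAMES")   # hypothetical path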