#!/usr/bin/env python3
"""
Metropoli BBS - PCBIndex to HTML Converter (Retro Edition)
==========================================================
This script generates static HTML index files for a directory tree, mimicking
the look and feel of a DOS-era BBS file listing.
Features:
- Recursive directory traversal (bottom-up).
- Parses legacy PCBoard file descriptions (_INDEX_ / 00_INDEX.TXT).
- Extracts and cleans descriptions from FILE_ID.DIZ in unpacked archives.
(Reads from .src folder, links to web folder).
- Unified "Retro Text" pipeline:
- Strips ANSI codes and PCB Color Codes (@Xnn) at the byte level.
- Detects encodings: SF7 (Finnish 7-bit), CP437 (DOS), UTF-8, Latin1.
- Transcodes everything to valid UTF-8 HTML.
- STRICT Whitespace Handling:
- Removes trailing whitespace.
- Removes vertical padding (empty lines at start/end).
- PRESERVES leading whitespace (indentation) for ASCII art.
- Responsive retro styling:
- Uses embedded "Perfect DOS" web font (locally hosted in /retrofonts/).
- Dynamic width (110ch vs 132ch) based on content length.
- Forces 110ch width for specific curated directories.
- Advanced breadcrumb navigation for unpacked trees (Archive boundary detection).
"""
import os
import re
import html
import datetime
import urllib.parse
import fnmatch
import math
import stat
# =============================================================================
# CONFIGURATION
# =============================================================================
ROOT_DIR = "/home/ftp"
# Destination for clickable links (HTML versions generated by viewer)
UNPACKED_WEB_ROOT = "/home/ftp/unpacked"
# Source for reading metadata (Raw files)
UNPACKED_SRC_ROOT = "/home/ftp/unpacked.src"
# Files and directories to exclude from the generated index.
# CASE INSENSITIVE matching.
EXCLUDED_PATTERNS = {
".*", # Hidden files
"_INDEX*", # PCBoard Index source files
"INDEX.*",
"00_INDEX.TXT",
"CORE", # Linux core dumps
"*.desc", # Internal description files
"mpolilogo*.*", # Website assets
"starportlogo*", # Website assets (Starport variant)
"index-style",
"favicon.ico",
"files",
"sitemap*.txt",
"robots.txt",
"pub",
"DESCRIPT.ION", # 4DOS descriptions
"_index*",
"index.html", # The file we are generating
"index.htm",
"retrofonts", # Exclude font directory
"unpacked.src" # Exclude raw source of unpacked files from indexing
}
# Mapping file extensions to their BBS-era archiver commands.
# Spaces here are preserved in the HTML via 'white-space: pre'.
ARCHIVER_MAP = {
'.ZIP': '[ PKUNZIP ]',
'.ARJ': '[ ARJ x ]',
'.LZH': '[ LHARC e ]',
'.LHA': '[ LHARC e ]',
'.RAR': '[ UNRAR ]',
'.ZOO': '[ ZOO x ]',
'.ARC': '[ ARC e ]',
'.PAK': '[ PAK e ]',
'.EXE': '[ UNPACK ]'
}
# 1990-01-01 00:00:00 Timestamp.
# Used as a fallback for missing dates or to clamp "future" dates.
FIXED_TIMESTAMP = 631152000.0
# Cutoff Year for Date Sanity Check.
# Files newer than this (e.g., generated logs) get clamped to 1990 styling.
MAX_VALID_YEAR = 2015
# =============================================================================
# RETRO TEXT PROCESSING LOGIC
# =============================================================================
# Regex to identify ANSI escape sequences (colors, cursor moves).
ANSI_BYTES_REGEX = re.compile(rb'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
# Regex to identify PCBoard Color Codes (e.g., @X0A, @X1F).
# Matches @X followed by two hex digits. Case insensitive.
PCB_BYTES_REGEX = re.compile(rb'@X[0-9A-F]{2}', re.IGNORECASE)
# Standard Word Regex (2+ letters) used for text density analysis.
WORD_REGEX = re.compile(rb'\b[a-zA-Z]{2,}\b')
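# Illustrative example (hypothetical byte string): applying both substitutions turns
#   b'\x1b[1;33mHELLO @X0AWORLD\x1b[0m'  into  b'HELLO WORLD'.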
def is_binary_safe(content):
"""
Determines if a byte string is likely binary garbage rather than text.
Checks for high density of control codes (excluding tabs/newlines).
"""
if not content: return False
# Check for bytes < 9 (excluding Null sometimes used in text padding)
# and 14-26 (Shift Out, DLE, etc).
bad = sum(1 for b in content if (b < 9 and b != 0) or (14 <= b <= 26))
return bad > (len(content) * 0.10)
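# Example for is_binary_safe (hypothetical inputs): a run of raw control bytes such as
# b'\x01\x02\x03\x04' exceeds the 10% threshold and returns True; ordinary text with
# tabs and newlines returns False.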
def detect_sf7(raw_bytes):
"""
Detects Finnish SF7 (ISO 646-FI) encoding.
SF7 uses 7-bit chars (e.g. '{') to represent Scandis.
If high-bit chars (>127) exist, it is strictly NOT SF7.
"""
total_len = len(raw_bytes)
if total_len == 0: return False
# SAFETY 1: SF7 is a 7-bit encoding. If we see 8-bit bytes (CP437 blocks, Latin1 chars),
# it cannot be SF7. This protects CP437 art and Latin1 text.
if any(b > 127 for b in raw_bytes):
return False
    # SAFETY 2: Word Density.
    # Skipped for short descriptions (under 100 bytes), where the check is unreliable:
    # "Very good!" would score high, but a bare "[]" would score low.
if total_len > 100:
total_words_list = WORD_REGEX.findall(raw_bytes)
total_words = len(total_words_list)
if (total_words / total_len) < 0.05: return False
# Heuristic: Ratio of '{' (ä) to normal letters.
sf7_marker_count = raw_bytes.count(b'{')
letter_count = sum(1 for b in raw_bytes if (65 <= b <= 90) or (97 <= b <= 122))
if letter_count == 0: return False
# Threshold: '{' must represent > 2.5% of all letters.
return (sf7_marker_count / letter_count) > 0.025
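# Example for detect_sf7 (hypothetical SF7 text): b'T{ss{ on k{yt|ss{ skandeja'
# returns True: no 8-bit bytes, and '{' makes up well over 2.5% of the letters.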
def detect_encoding(raw_bytes, source_filename=""):
"""
Determines the likely encoding of the byte stream.
Prioritizes: SF7 -> ASCII -> UTF-8 -> CP437 -> Latin1.
"""
# 0. Special Case: 00_INDEX.TXT is known to be CP437 (or ASCII).
# Never SF7.
is_00_index = "00_INDEX" in source_filename.upper()
# 1. Check for SF7
# Skip if source is 00_INDEX or if detection fails
if not is_00_index and detect_sf7(raw_bytes):
return 'SF7'
# 2. Check for Pure ASCII (7-bit clean)
if not any(b > 127 for b in raw_bytes): return 'ASCII'
# 3. Check for valid UTF-8
try:
raw_bytes.decode('utf-8')
return 'UTF-8'
except UnicodeDecodeError: pass
# 4. Fallback: Distinguish CP437 (DOS Art) from Latin1 (Windows/Unix)
cp437_score = 0
latin1_score = 0
# If it's 00_INDEX.TXT, we bias heavily towards CP437
if is_00_index:
cp437_score += 100
    for b in raw_bytes:
        if 0x80 <= b <= 0x9F: cp437_score += 5   # CP437 accented letters (C1 controls in Latin1)
        elif 0xB0 <= b <= 0xDF: cp437_score += 1 # CP437 shading and box-drawing blocks
        # Scandi letters in Latin1. Note: the uppercase codes (0xC4, 0xD6, 0xC5) fall inside
        # the 0xB0-0xDF range above, so only lowercase ä/ö/å (0xE4, 0xF6, 0xE5) reach this branch.
        elif b in [0xC4, 0xD6, 0xC5, 0xE4, 0xF6, 0xE5]: latin1_score += 3
        elif b == 0xA2: cp437_score += 2          # 0xA2 is 'ó' in CP437
return 'CP437' if cp437_score >= latin1_score else 'LATIN1'
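# Example for detect_encoding (hypothetical bytes): b'\xc9\xcd\xbb' (CP437 box drawing)
# fails UTF-8 decoding and scores 3 to 0 for CP437, so the result is 'CP437'.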
def transcode_text(raw_bytes, encoding):
"""
Converts the raw bytes from the detected source encoding into a clean UTF-8 string.
"""
enc = encoding.lower()
if enc == 'sf7':
text = raw_bytes.decode('ascii', errors='replace')
# Map SF7 chars to Unicode
return text.translate(str.maketrans({
'[': 'Ä', '\\': 'Ö', ']': 'Å',
'{': 'ä', '|': 'ö', '}': 'å',
'~': 'ü'
}))
elif enc == 'cp437': return raw_bytes.decode('cp437', errors='replace')
elif enc == 'utf-8': return raw_bytes.decode('utf-8', errors='replace')
elif enc == 'ascii': return raw_bytes.decode('ascii', errors='replace')
else: return raw_bytes.decode('latin1', errors='replace')
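# Example for transcode_text (hypothetical SF7 bytes):
#   transcode_text(b'K{yt|ss{', 'SF7') -> 'Käytössä'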
def process_retro_text(raw_bytes, source_filename=""):
"""
Master function to clean and convert description text.
1. Strips ANSI & PCB codes (bytes) and DOS EOF.
2. Detects encoding & Transcodes (UTF-8).
3. Trims whitespace (safe for ASCII art).
4. Escapes HTML.
"""
if not raw_bytes: return ""
# Strip ANSI escape sequences
clean_bytes = ANSI_BYTES_REGEX.sub(b'', raw_bytes)
# Strip PCBoard Color Codes (@Xnn)
clean_bytes = PCB_BYTES_REGEX.sub(b'', clean_bytes)
# Strip DOS EOF Char (0x1A / 26) - Common in old text files
clean_bytes = clean_bytes.replace(b'\x1a', b'')
# Safety check
if is_binary_safe(clean_bytes): return "[Binary Data]"
encoding = detect_encoding(clean_bytes, source_filename=source_filename)
utf8_text = transcode_text(clean_bytes, encoding)
# Whitespace Cleanup:
# 1. Right-Strip EVERY line (Remove trailing whitespace)
# We do NOT lstrip() here to preserve indentation/centering.
lines = [line.rstrip() for line in utf8_text.splitlines()]
# 2. Vertical Trim (Remove empty lines from Start)
while lines and not lines[0]:
lines.pop(0)
# 3. Vertical Trim (Remove empty lines from End)
while lines and not lines[-1]:
lines.pop()
# 4. Rejoin with clean newlines (strip vertical padding of result)
final_text = "\n".join(lines)
return html.escape(final_text)
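# Example for process_retro_text (hypothetical description bytes):
#   process_retro_text(b'\x1b[1mHi & bye \r\n\r\n') -> 'Hi &amp; bye'
# (ANSI prefix stripped, trailing blank lines trimmed, '&' escaped for HTML).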
# =============================================================================
# HTML TEMPLATES
# =============================================================================
HTML_HEADER = """
{header_title} - {raw_path}
| FILENAME |
|
SIZE |
DATE |
DESCRIPTION |
"""
HTML_FOOTER = """
TOTAL: {file_count}
"""
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def safe_str(s):
"""
Encodes string to UTF-8 properly for HTML injection.
Handles 'surrogateescape' artifacts from os.listdir (e.g., lone surrogates like \udcaf)
by reverting them to bytes and decoding with replacement characters.
"""
if s is None: return ""
try:
# Step 1: Revert 'surrogateescape' decoding (common in os.listdir on Linux)
# to get original raw bytes.
raw_bytes = s.encode('utf-8', 'surrogateescape')
except UnicodeError:
# If surrogates are somehow complex or mixed, force replacement during encode
raw_bytes = s.encode('utf-8', 'replace')
# Step 2: Decode as UTF-8 with replacement for errors (invalid UTF-8 bytes)
# This guarantees the result is a valid UTF-8 string without surrogates.
return raw_bytes.decode('utf-8', 'replace')
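# Example for safe_str (hypothetical listing entry): a name decoded via surrogateescape,
# e.g. 'b\udce4d' (raw byte 0xE4), becomes 'b\ufffdd', i.e. valid UTF-8 with U+FFFD.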
def force_clean(s):
"""
Last-resort sanitizer used before file writing.
Ensures absolute UTF-8 validity by stripping any lingering surrogates.
"""
if not s: return ""
return s.encode('utf-8', 'replace').decode('utf-8')
def safe_href(s):
"""URL-encodes a path component."""
if s is None: return ""
return urllib.parse.quote(os.fsencode(s))
def is_excluded(filename):
"""Checks if a filename matches any of the exclusion patterns."""
fname = filename.upper()
for pattern in EXCLUDED_PATTERNS:
if fnmatch.fnmatch(fname, pattern.upper()):
return True
return False
def is_valid_file_object(path):
"""Checks if path is a regular file or directory (and exists)."""
try:
st = os.lstat(path)
if stat.S_ISLNK(st.st_mode): return False
if stat.S_ISREG(st.st_mode) or stat.S_ISDIR(st.st_mode): return True
return False
except OSError:
return False
def is_effectively_empty(path):
"""
Recursively checks if a directory is empty or contains only excluded files.
"""
for root, dirs, files in os.walk(path):
dirs[:] = [d for d in dirs if is_valid_file_object(os.path.join(root, d))]
for f in files:
fp = os.path.join(root, f)
if not is_valid_file_object(fp): continue
if not is_excluded(f): return False
return True
def read_file_bytes_safe(path):
"""Attempts to read a file as raw bytes. Returns None on failure."""
try:
with open(path, 'rb') as f:
return f.read()
except Exception:
return None
def format_size(size_bytes):
"""Formats bytes into human readable string strictly with 2 decimals."""
if size_bytes == 0: return "0.00 B"
size_name = ("B", "KB", "MB", "GB", "TB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = size_bytes / p
return "%.2f %s" % (s, size_name[i])
def get_folder_size(start_path):
"""Calculates total size of a directory (excluding excluded files)."""
total_size = 0
for dirpath, dirnames, filenames in os.walk(start_path):
for f in filenames:
if not is_excluded(f):
fp = os.path.join(dirpath, f)
if is_valid_file_object(fp):
try:
total_size += os.path.getsize(fp)
except OSError:
pass
return total_size
def backup_existing_indices(full_path):
"""
Backs up existing index.html files if they weren't generated by this tool.
"""
targets = ["index.html", "index.htm", "INDEX.HTML", "INDEX.HTM"]
our_markers = [b'content="Metropoli-PCB-Indexer"']
for target in targets:
target_path = os.path.join(full_path, target)
if is_valid_file_object(target_path):
is_ours = False
try:
with open(target_path, 'rb') as f:
head = f.read(4096)
for marker in our_markers:
if marker in head:
is_ours = True
break
            except OSError:
                pass
if not is_ours:
new_name = "_" + target
new_path = os.path.join(full_path, new_name)
try:
os.replace(target_path, new_path)
except OSError:
pass
def get_better_desc(rel_dir_path, filename):
"""
Looks for a FILE_ID.DIZ inside the corresponding unpacked directory.
Uses process_retro_text to clean it up.
CRITICAL: Reads metadata from UNPACKED_SRC_ROOT (raw text).
"""
lower_rel = rel_dir_path.lower()
lower_filename = filename.lower()
# Check src path for reading
src_path = os.path.join(UNPACKED_SRC_ROOT, lower_rel, lower_filename)
# We check if the folder exists in source to try and read the DIZ
if not os.path.isdir(src_path) or os.path.islink(src_path):
return None
candidates = ["file_id.old", "FILE_ID.OLD", "file_id.diz", "FILE_ID.DIZ"]
for candidate in candidates:
candidate_path = os.path.join(src_path, candidate)
if is_valid_file_object(candidate_path):
raw_content = read_file_bytes_safe(candidate_path)
if raw_content:
# DIZ files are usually CP437 or SF7, treated generally
desc_html = process_retro_text(raw_content, source_filename=candidate)
# DO NOT STRIP() HERE! strict cleanup is already done in process_retro_text
return desc_html
return None
def generate_breadcrumb(rel_path):
"""
    Generates the navigation breadcrumb HTML in 'C:\\DIR' style.
Handles 'unpacked' directories specially to redirect intermediate
paths back to the compressed file index and HIDE the 'UNPACKED' level.
"""
if not rel_path or rel_path == ".":
return 'C:'
parts = rel_path.split(os.sep)
html_parts = []
# Root Link
up_levels = len(parts)
root_link = "../" * up_levels
    html_parts.append(f'<a href="{root_link}">C:</a>')
# Detect if we are inside the 'unpacked' tree
is_unpacked = (parts[0].lower() == 'unpacked')
# Flag to track if we have crossed the boundary from "Folder Structure" to "Inside Zip"
inside_archive_boundary = False
for i, part in enumerate(parts):
# Handle "UNPACKED" word - Skip it entirely from display
if is_unpacked and i == 0:
continue
levels_up = len(parts) - 1 - i
# Link construction logic for unpacked tree
if is_unpacked:
# Reconstruct the path from ROOT to the current part (skipping 'unpacked')
source_segments = parts[1:i+1]
            # Resolve this path in ROOT_DIR case-insensitively to determine whether
            # it points to a real source directory or a source file (archive).
            # This handles case mismatches (e.g. 'games' vs 'GAMES') that would make
            # a naive os.path.isdir() check fail on Linux's case-sensitive filesystem.
check_path = ROOT_DIR
resolved_segments = []
valid_source = True
for seg in source_segments:
found = False
if os.path.isdir(check_path):
try:
for entry in os.listdir(check_path):
if entry.lower() == seg.lower():
check_path = os.path.join(check_path, entry)
resolved_segments.append(entry)
found = True
break
except OSError: pass
if not found:
valid_source = False
break
is_source_file = False
is_source_dir = False
if valid_source:
if os.path.isfile(check_path): is_source_file = True
elif os.path.isdir(check_path): is_source_dir = True
# Determine target URL based on whether we are looking at a Directory or a File (Archive)
if not inside_archive_boundary:
if is_source_file:
# It IS the archive file (e.g. doom.zip).
# This is the boundary. Link to the Unpacked Index (self).
inside_archive_boundary = True
# Use source_segments (current unpacked path context) for unpacked links
target_url = ("../" * len(parts)) + "unpacked/" + "/".join(safe_href(p) for p in source_segments) + "/index.html"
elif is_source_dir:
# It is a directory in the source tree (e.g. GAMES).
# Link to the Compressed Index (Source).
# Use resolved_segments to ensure we link to the correct Case on disk (e.g. GAMES not games)
target_url = ("../" * len(parts)) + "/".join(safe_href(p) for p in resolved_segments) + "/index.html"
else:
# Fallback (Inside archive or path mismatch)
target_url = ("../" * len(parts)) + "unpacked/" + "/".join(safe_href(p) for p in source_segments) + "/index.html"
else:
# We are ALREADY inside the archive (e.g. DATA).
# Link to the Unpacked Index (subdir).
target_url = ("../" * len(parts)) + "unpacked/" + "/".join(safe_href(p) for p in source_segments) + "/index.html"
            html_parts.append(f'<a href="{target_url}">{safe_str(part.upper())}</a>')
else:
# Standard logic for normal tree
link_target = "../" * levels_up + "index.html"
if levels_up == 0: link_target = "index.html"
            html_parts.append(f'<a href="{link_target}">{safe_str(part.upper())}</a>')
# Join with backslash, no spaces
    return '\\'.join(html_parts)
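# Example for generate_breadcrumb (hypothetical path): rel_path 'GAMES/DOOM' in the
# normal tree yields 'C:' (href '../../'), 'GAMES' (href '../index.html') and
# 'DOOM' (href 'index.html'), joined with backslashes.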
def parse_date(date_str):
"""Parses MM-DD-YY date string into standardized YYYY-MM-DD."""
try:
m, d, y = date_str.split('-')
y_int = int(y)
full_year = 1900 + y_int if y_int > 50 else 2000 + y_int
dt = datetime.datetime(full_year, int(m), int(d))
return dt.strftime("%Y-%m-%d"), int(dt.timestamp())
except:
return date_str, 0
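# Examples for parse_date: parse_date('12-24-94') -> ('1994-12-24', <epoch>),
# parse_date('01-05-03') -> ('2003-01-05', <epoch>); two-digit years of 50 or
# less map to 20xx.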
def parse_htaccess(full_path):
"""
Parses .htaccess for AddDescription fields.
"""
htaccess_path = os.path.join(full_path, ".htaccess")
desc_map = {}
if is_valid_file_object(htaccess_path):
content_bytes = read_file_bytes_safe(htaccess_path)
if content_bytes:
# .htaccess is usually simple text
try: content = content_bytes.decode('utf-8')
except UnicodeDecodeError: content = content_bytes.decode('latin-1', errors='replace')
lines = content.splitlines()
regex = re.compile(r'^\s*AddDescription\s+"([^"]+)"\s+"(?:\*/)?([^"]+)"', re.IGNORECASE)
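            # A matching line looks like this (hypothetical entry):
            #   AddDescription "Classic platform game" "JAZZ12.ZIP"
            # which maps 'JAZZ12.ZIP' -> 'Classic platform game'.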
for line in lines:
m = regex.search(line)
if m: desc_map[m.group(2).upper()] = m.group(1)
return desc_map
def parse_pcboard_index(index_path):
"""
Parses a legacy PCBoard _INDEX_ file.
"""
entries = []
current_entry = None
if not is_valid_file_object(index_path): return []
content_bytes = read_file_bytes_safe(index_path)
if not content_bytes: return []
# Get filename for detection logic (00_INDEX check)
index_filename = os.path.basename(index_path)
lines = content_bytes.splitlines()
line_regex = re.compile(rb'^([A-Z0-9_\-\.]{1,12})\s+(\d+)\s+(\d{2}-\d{2}-\d{2})\s+(.*)')
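    # A header line matched by line_regex looks like this (hypothetical entry):
    #   DOOM19S.ZIP   2394176  12-23-93  DOOM shareware, episode one
    # Continuation lines normally start with a '|' delimiter before the description text.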
for idx, line in enumerate(lines):
line = line.rstrip()
if not line: continue
match = line_regex.match(line)
if match:
if current_entry: entries.append(current_entry)
f_name = match.group(1).decode('ascii', errors='ignore')
f_size = match.group(2).decode('ascii', errors='ignore')
f_date = match.group(3).decode('ascii', errors='ignore')
raw_desc = match.group(4)
current_entry = {
'filename': f_name,
'size': f_size,
'raw_date': f_date,
'raw_desc_lines': [raw_desc],
'original_order': idx
}
elif current_entry:
# Handle Continuation Lines
# We must detect if it's a pipe-separated continuation (standard)
# or just a wrapped line.
# 1. Check if the line *visually* starts with a pipe
if line.lstrip().startswith(b'|'):
# Locate the pipe char
pidx = line.find(b'|')
# Content is everything after the pipe.
# We RSTRIP to remove trailing space, but we do NOT LSTRIP the content
# to preserve any intended indentation (ASCII art).
content = line[pidx+1:].rstrip()
current_entry['raw_desc_lines'].append(content)
else:
# Fallback: No pipe found. We have to assume it's just text.
# In this case we strip to be safe, as indentation without a delimiter is unreliable.
current_entry['raw_desc_lines'].append(line.strip())
if current_entry: entries.append(current_entry)
# Process descriptions
for e in entries:
full_desc_bytes = b'\n'.join(e['raw_desc_lines'])
# Pass the index filename so process_retro_text knows if it's 00_INDEX.TXT
e['desc'] = process_retro_text(full_desc_bytes, source_filename=index_filename)
return entries
def get_unpack_url(rel_dir_path, filename):
"""
Checks if an archive has a corresponding unpacked folder.
Links to the WEB root (HTML version).
"""
lower_rel = rel_dir_path.lower()
lower_filename = filename.lower()
# Check Web path for linking
web_disk_path = os.path.join(UNPACKED_WEB_ROOT, lower_rel, lower_filename)
# We check if the folder exists (even if we ignore it for indexing)
if os.path.isdir(web_disk_path) and not os.path.islink(web_disk_path):
_, ext = os.path.splitext(filename.upper())
label = ARCHIVER_MAP.get(ext, '[ UNPACK ]')
url_path = f"/unpacked/{safe_href(lower_rel)}/{safe_href(lower_filename)}/"
return url_path, label
return None, None
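# Example for get_unpack_url (hypothetical archive): if /home/ftp/unpacked/games/doom19s.zip/
# exists as a directory, get_unpack_url('GAMES', 'DOOM19S.ZIP') returns
# ('/unpacked/games/doom19s.zip/', '[ PKUNZIP ]').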
def get_context_link_html(full_path):
"""
Links from unpacked view back to original file.
Adjusted to handle checking within WEB root.
"""
abs_unpacked = os.path.abspath(UNPACKED_WEB_ROOT)
abs_current = os.path.abspath(full_path)
if abs_current.startswith(abs_unpacked) and abs_current != abs_unpacked:
rel_from_unpacked = os.path.relpath(abs_current, abs_unpacked)
path_parts = rel_from_unpacked.split(os.sep)
current_phys_path = ROOT_DIR
matched_url_parts = []
for part in path_parts:
found_name = None
try:
if os.path.isdir(current_phys_path):
for item in os.listdir(current_phys_path):
if item.lower() == part.lower():
found_name = item
break
except OSError: break
if found_name:
next_phys_path = os.path.join(current_phys_path, found_name)
if os.path.isfile(next_phys_path):
if not matched_url_parts:
href = "/"
else:
href = "/" + "/".join(safe_href(p) for p in matched_url_parts) + "/"
                        # Link back to the source directory listing (generic label).
                        return f'<a href="{href}">[ ORIGINAL DIR ]</a>'
current_phys_path = next_phys_path
matched_url_parts.append(found_name)
else:
break
return ""
def update_folder_timestamps(full_path, index_file_path):
"""
Updates the folder timestamp to match the newest file inside it.
Clamps dates > MAX_VALID_YEAR.
CRITICAL: This ignores 'index.html', 'index.htm', and '_INDEX_'
files to ensure the directory timestamp reflects content, not the
generated index itself.
"""
try:
# 1. Reset timestamps of excluded items to fixed date
# EXCEPTION: We do NOT reset index.html here, allowing it to stay "Current"
try:
immediate_items = os.listdir(full_path)
for item in immediate_items:
# Skip touching the index files we just generated so they stay fresh
if item.lower() in ["index.html", "index.htm", "_index_"]:
continue
if is_excluded(item):
item_path = os.path.join(full_path, item)
if is_valid_file_object(item_path):
try:
os.utime(item_path, (FIXED_TIMESTAMP, FIXED_TIMESTAMP))
except OSError: pass
except OSError: pass
# 2. Find newest content file
newest_ts = 0.0
for root, dirs, files in os.walk(full_path):
dirs[:] = [d for d in dirs if not is_excluded(d) and is_valid_file_object(os.path.join(root, d))]
for f in files:
# Explicitly skip index files from calculation
if f.lower() in ["index.html", "index.htm", "_index_"]:
continue
if is_excluded(f): continue
file_path = os.path.join(root, f)
if file_path == index_file_path: continue
if is_valid_file_object(file_path):
try:
stat_res = os.stat(file_path)
# Sanity Check
file_year = datetime.datetime.fromtimestamp(stat_res.st_mtime).year
if file_year > MAX_VALID_YEAR:
# If file is too new (e.g. logs), ignore it for sorting
# or clamp it. Here we clamp it to keep folder retro.
current_ts = FIXED_TIMESTAMP
else:
current_ts = stat_res.st_mtime
if current_ts > newest_ts:
newest_ts = current_ts
except OSError: pass
for d in dirs:
dir_path = os.path.join(root, d)
if is_valid_file_object(dir_path):
try:
stat_res = os.stat(dir_path)
if stat_res.st_mtime > newest_ts:
newest_ts = stat_res.st_mtime
except OSError: pass
if newest_ts > 0:
os.utime(full_path, (newest_ts, newest_ts))
else:
os.utime(full_path, (FIXED_TIMESTAMP, FIXED_TIMESTAMP))
except OSError as e:
print(f"Timestamp update failed for {full_path}: {e}")
# =============================================================================
# MAIN LOGIC
# =============================================================================
def generate_folder_index(full_path):
print(f"Processing: {safe_str(full_path)}")
backup_existing_indices(full_path)
# Locate index source
index_file = os.path.join(full_path, "_INDEX_")
if not is_valid_file_object(index_file):
alt_index = os.path.join(full_path, "00_index.txt")
if is_valid_file_object(alt_index): index_file = alt_index
# Parse sources
parsed_entries = parse_pcboard_index(index_file)
indexed_filenames = {e['filename'].upper() for e in parsed_entries}
htaccess_desc = parse_htaccess(full_path)
try: actual_items = os.listdir(full_path)
except OSError: return
disk_map_files = {}
dir_list = []
file_list = []
context_link_html = get_context_link_html(full_path)
# 1. Process Directories & Files on Disk
for item in actual_items:
full_item_path = os.path.join(full_path, item)
# HIDE UNPACKED FOLDERS FROM TOP LEVEL
if full_path == ROOT_DIR and item.lower() in ['unpacked', 'unpacked.src']:
continue
upper_item = item.upper()
if not is_valid_file_object(full_item_path): continue
if os.path.isdir(full_item_path):
if not is_excluded(item):
if is_effectively_empty(full_item_path): continue
stat_res = os.stat(full_item_path)
dt = datetime.datetime.fromtimestamp(stat_res.st_mtime)
if dt.year > MAX_VALID_YEAR or int(dt.timestamp()) == int(FIXED_TIMESTAMP):
date_str = ""
else:
date_str = dt.strftime("%Y-%m-%d")
desc = htaccess_desc.get(upper_item, "")
dir_size = get_folder_size(full_item_path)
size_str = format_size(dir_size)
dir_list.append({
'filename': item, 'real_name': item, 'size': size_str,
'raw_date': None, 'formatted_date': date_str,
'timestamp': int(dt.timestamp()), 'desc': desc,
'original_order': -1, 'is_dir': True
})
else:
disk_map_files[upper_item] = item
# Only add if not already in PCBoard index (merged later)
if upper_item not in indexed_filenames and not is_excluded(item):
# Determine correct path for STATS (Size/Date).
# If we are in the 'unpacked' web root, we should check if a source file exists
# in 'unpacked.src' and use that instead.
target_stat_path = full_item_path
if full_path.startswith(UNPACKED_WEB_ROOT):
# Calculate relative path from the web root, e.g. "games/doom"
rel_from_web = os.path.relpath(full_path, UNPACKED_WEB_ROOT)
# Construct potential source path: /home/ftp/unpacked.src/games/doom/file.txt
possible_src = os.path.join(UNPACKED_SRC_ROOT, rel_from_web, item)
if is_valid_file_object(possible_src):
target_stat_path = possible_src
stat_res = os.stat(target_stat_path)
dt = datetime.datetime.fromtimestamp(stat_res.st_mtime)
if dt.year > MAX_VALID_YEAR or int(dt.timestamp()) == int(FIXED_TIMESTAMP):
date_str = ""
else:
date_str = dt.strftime("%Y-%m-%d")
raw_desc = htaccess_desc.get(upper_item, "")
desc_html = html.escape(raw_desc)
file_list.append({
'filename': item, 'real_name': item, 'size': str(stat_res.st_size),
'raw_date': None, 'formatted_date': date_str,
'timestamp': int(dt.timestamp()), 'desc': desc_html,
'original_order': 999999, 'is_dir': False
})
# 2. Merge Lists
final_list = []
dir_list.sort(key=lambda x: x['filename'].upper())
final_list.extend(dir_list)
for entry in parsed_entries:
fname_upper = entry['filename'].upper()
if fname_upper in disk_map_files:
entry['real_name'] = disk_map_files[fname_upper]
entry['is_dir'] = False
final_list.append(entry)
file_list.sort(key=lambda x: x['filename'].upper())
final_list.extend(file_list)
rel_path = os.path.relpath(full_path, ROOT_DIR)
if rel_path == ".": rel_path = ""
html_rows = []
# 3. Width Logic (110ch vs 132ch)
# Default is 110ch (Fits legacy forum files)
# CHECK FOR FORCED 110ch MODE based on curated directories
forced_mode_keywords = {'software', 'hardware', 'skene', 'tlr'}
force_110 = False
# rel_path is already calculated and sanitized (empty string for root)
if rel_path:
first_part = rel_path.split(os.sep)[0].lower()
if first_part in forced_mode_keywords:
force_110 = True
needs_wide = False
# Only run dynamic check if NOT forced
if not force_110:
for item in final_list:
real_name = item.get('real_name', item['filename'])
# Check 1: Long Filename trigger (increased to 35)
if len(real_name) > 35:
needs_wide = True
break
# Check 2: Long Description trigger (increased to 50)
if item['desc']:
                # Unescape so entities like &quot; count as 1 char
# Also strip whitespace from check since we will strip it in display
clean_d = html.unescape(re.sub(r'<[^>]+>', '', item['desc']))
longest_line = max(len(line.rstrip()) for line in clean_d.splitlines()) if clean_d else 0
if longest_line > 50:
needs_wide = True
break
frame_style = "width: 132ch;" if needs_wide else "width: 110ch;"
# 4. Generate HTML Rows
for item in final_list:
real_name = item.get('real_name', item['filename'])
if 'formatted_date' in item:
display_date = item['formatted_date']
timestamp = item['timestamp']
else:
display_date, timestamp = parse_date(item['raw_date'])
if timestamp == int(FIXED_TIMESTAMP):
display_date = ""
if item.get('is_dir', False):
desc_html = html.escape(item['desc'])
else:
better_desc = get_better_desc(rel_path, real_name)
if better_desc:
desc_html = better_desc
else:
desc_html = item['desc']
display_filename = safe_str(item['filename'])
href_link = safe_href(real_name) + ("/" if item.get('is_dir', False) else "")
if item.get('is_dir', False):
unpack_html = '[ DIR ]'
else:
unpack_url, unpack_label = get_unpack_url(rel_path, real_name)
if unpack_url:
                unpack_html = f'<a href="{unpack_url}">{unpack_label}</a>'
else:
unpack_html = ""
data_bytes = item['size']
if item.get('is_dir', False): sort_bytes = 0
else: sort_bytes = data_bytes
row = f"""
| {display_filename} |
{unpack_html} |
{item['size']} |
{display_date} |
{desc_html} |
"""
html_rows.append(row)
# 5. Write Output
out_path = os.path.join(full_path, "index.html")
# Generate C:\ style path for window title
raw_path_display = "C:\\" if (not rel_path or rel_path == ".") else "C:\\" + safe_str(rel_path.replace("/", "\\").upper())
# BRANDING LOGIC
# Check if we are in the 'skene' tree (either standard or unpacked)
# rel_path is relative to ROOT_DIR (/home/ftp)
# e.g. 'skene', 'skene/demo', 'unpacked/skene', 'unpacked/skene/demo'
check_path = rel_path.lower().replace('\\', '/')
is_starport = False
if check_path == 'skene' or check_path.startswith('skene/'):
is_starport = True
elif check_path == 'unpacked/skene' or check_path.startswith('unpacked/skene/'):
is_starport = True
if is_starport:
header_title = "Starport BBS"
header_logo = "/starportlogo-500x242.png"
header_alt = "Starport BBS"
else:
header_title = "Metropoli BBS"
header_logo = "/mpolilogo2-transparent-500x500.png"
header_alt = "Metropoli BBS"
with open(out_path, 'w', encoding='utf-8') as f:
bc_html = generate_breadcrumb(rel_path)
# Prepare the header content
header_content = HTML_HEADER.format(
raw_path=raw_path_display,
breadcrumb_html=bc_html,
context_link_html=context_link_html,
frame_style=frame_style,
header_title=header_title,
header_logo=header_logo,
header_alt=header_alt
)
# FINAL SANITIZATION: force_clean ensures no surrogates exist before writing
f.write(force_clean(header_content))
f.write("".join(html_rows))
f.write(HTML_FOOTER.format(file_count=len(final_list)))
update_folder_timestamps(full_path, out_path)
if __name__ == "__main__":
for subdir, dirs, files in os.walk(ROOT_DIR, topdown=False):
# Exclude unpacked.src completely from traversal
if "unpacked.src" in subdir:
continue
# Modify dirs in-place to prevent os.walk from entering excluded dirs next
dirs[:] = [d for d in dirs if not is_excluded(d)]
generate_folder_index(subdir)