Add HTML smuggling analyzer, surface macro detail, split per-type modules

2026-05-05 06:01:58 -07:00
parent 5dbcf6892d
commit a609527c0b
10 changed files with 940 additions and 96 deletions
@@ -98,6 +98,13 @@ All notable changes to this project will be documented in this file.
 - Elastic YARA rules synced to upstream `d131ea8` (2026-04-30, 686 rules — 684 upstream + Morpes/Torii retained locally after Elastic rotated them out)
 - YARA-Forge bumped to 0.9.1 (release `20260503`, 2026-05-03) — separate `YARAForge_Extended.yar` pack alongside the Elastic rules

+### File-type analyzers
+- HTML smuggling analyzer (`app/utils/htmlsmuggle.py`) — pattern set + scoring model ported from RootUp/SmuggleShield. Runs at upload time on `.html` / `.htm` files. Catches in-page payload assembly (atob → Uint8Array → Blob → URL.createObjectURL → `<a download>` click), GWT smuggling artifacts, WebAssembly drop chains, dataset-driven payload tags, and ~80 other regex signatures. Output lands in `file_info.html_smuggle_info`.
+- Office macro detail surfacing — the existing olevba pipeline now exposes per-module VBA source, autoexec triggers, suspicious keyword hits, and IOCs as structured tables on the upload-result page (was previously only a one-line "5 auto-execution triggers detected" summary).
+- T1221 Remote Template Injection detection — `_scan_external_relationships` walks every OOXML container's `*.rels` files looking for external `attachedTemplate` / `oleObject` / `subDocument` / `frame` references. Catches Atomic Red Team's `Calculator.docx` (and the wider class) where `has_macros: false` but the malicious VBA lives in a remote `.dotm`.
+- File-type analyzers split into dedicated modules — `utils/office.py`, `utils/lnk.py`, `utils/htmlsmuggle.py`. `forensics.py` is now strictly PE / MalAPI / entropy. Re-exports preserved through `app/utils/__init__.py` so existing call sites keep working.
+- `allowed_extensions` expanded to cover macro-enabled Office (`docm`, `dotm`, `xlsm`, `xltm`), legacy formats (CFBF binaries `doc` / `xls`, plus `rtf`), and HTML (`html`, `htm`). Upload page now gates analysis tabs by file family: office + html files only show Static (Dynamic / EDR aren't relevant for these without an Office install on the target host); driver files keep the existing static-driver + HolyGrail flow.
+
 ### Notes
 - New runtime dependency: `requests==2.32.3`
 - Whiskers binary not committed — build via `cargo build --release` (see `Whiskers/BUILD.md`)
@@ -8,13 +8,26 @@ application:

 utils:
  allowed_extensions:
+    # Executables / loaders / drivers
    - exe
    - dll
    - bin
-    - docx
-    - xlsx
-    - lnk
    - sys
+    - lnk
+    # Word (OOXML + legacy CFBF) and RTF
+    - docx        # no macros per spec, but still uploadable for T1221 template injection samples
+    - docm        # macro-enabled
+    - dotm        # template macro-enabled (T1221 target)
+    - doc         # Word 97-2003 -- frequently weaponised with VBA macros
+    - rtf         # Rich Text Format -- OLE-embedded payloads, T1203 patterns
+    # Excel (OOXML + legacy CFBF)
+    - xlsx        # no macros per spec; still routed through olevba in case of XLM smuggling
+    - xlsm        # macro-enabled
+    - xltm        # template macro-enabled
+    - xls         # Excel 97-2003 -- carries VBA + Excel 4.0 / XLM macros
+    # HTML -- routed through the SmuggleShield-derived static analyzer
+    - html        # plain HTML
+    - htm         # alternate HTML extension
  max_file_size: 104857600  # 100MB in bytes
  upload_folder: "Uploads"
  result_folder: "Results"
@@ -64,6 +64,10 @@ document.addEventListener('DOMContentLoaded', function() {
        officeInfo: document.getElementById('officeInfo'),
        macroStatus: document.getElementById('macroStatus'),
        macroDetectionNotes: document.getElementById('macroDetectionNotes'),
+        htmlSmuggleInfo: document.getElementById('htmlSmuggleInfo'),
+        smuggleStatus: document.getElementById('smuggleStatus'),
+        smuggleDetectionNotes: document.getElementById('smuggleDetectionNotes'),
+        smuggleInfo: document.getElementById('smuggleInfo'),
        checksumInfo: document.getElementById('checksumInfo'),
        checksumStatus: document.getElementById('checksumStatus'),
        storedChecksum: document.getElementById('storedChecksum'),
@@ -193,19 +197,42 @@ document.addEventListener('DOMContentLoaded', function() {
    //
    // The analysis-mode selector is a single segmented control with one tab
    // per mode (Static / Dynamic / each EDR profile / HolyGrail). Each tab
-    // is tagged data-family="regular" or "driver"; we only show the family
-    // matching the uploaded file. The first visible tab becomes active.
+    // is tagged with one or more `data-family` values (space-separated) and
+    // only tabs matching the uploaded file's family are shown.
+    //
+    // Four families:
+    //   driver  -- .sys (-> static-driver, holygrail)
+    //   office  -- Word / Excel macro-bearing documents (-> static only;
+    //              dynamic / EDR don't make sense without an Office install
+    //              on the target host -- olevba is the relevant scanner)
+    //   html    -- .html / .htm (-> static only; SmuggleShield-derived
+    //              pattern analyzer runs at upload time as html_smuggle_info)
+    //   regular -- everything else (-> all / static / dynamic / edr:*)
+    const DRIVER_EXTS = new Set(['sys']);
+    const OFFICE_EXTS = new Set([
+        'docx', 'docm', 'dotm', 'doc', 'rtf',
+        'xlsx', 'xlsm', 'xltm', 'xls',
+    ]);
+    const HTML_EXTS = new Set(['html', 'htm']);
+
    function updateAnalysisOptions(fileExtension) {
-        isDriverFile = fileExtension.toLowerCase() === 'sys';
-        const family = isDriverFile ? 'driver' : 'regular';
+        const ext = (fileExtension || '').toLowerCase();
+        isDriverFile = DRIVER_EXTS.has(ext);
+        const family = isDriverFile ? 'driver'
+                     : OFFICE_EXTS.has(ext) ? 'office'
+                     : HTML_EXTS.has(ext)   ? 'html'
+                     : 'regular';

        const tabs = document.querySelectorAll('#modeTabs .lb-tab');
        const bodies = document.querySelectorAll('.lb-mode-body');

-        // Show only tabs for this file family; active state moves to first.
+        // Show only tabs whose `data-family` list contains this file's family.
+        // Multiple families are space-separated (e.g. `regular office` for the
+        // Static tab, which serves both classes).
        let firstVisible = null;
        tabs.forEach(t => {
-            const matches = t.dataset.family === family;
+            const families = (t.dataset.family || '').split(/\s+/);
+            const matches = families.includes(family);
            t.classList.toggle('hidden', !matches);
            t.classList.remove('active');
            if (matches && !firstVisible) firstVisible = t;
@@ -245,6 +272,7 @@ document.addEventListener('DOMContentLoaded', function() {
    function renderFileTypeSpecificInfo(fileInfo) {
        elements.peInfo.classList.add('hidden');
        elements.officeInfo.classList.add('hidden');
+        if (elements.htmlSmuggleInfo) elements.htmlSmuggleInfo.classList.add('hidden');
        elements.suspiciousImports.classList.add('hidden');

        if (fileInfo.entropy_analysis) {
@@ -359,24 +387,7 @@ document.addEventListener('DOMContentLoaded', function() {
        }
        else if (fileInfo.office_info) {
            elements.officeInfo.classList.remove('hidden');
-            const office = fileInfo.office_info;
-
-            elements.macroStatus.className = `px-3 py-1 text-sm rounded-full ${
-                office.has_macros ? 'bg-red-500/8 text-red-300 border border-red-500/22' : 'bg-green-500/8 text-green-300 border border-green-500/22'
-            }`;
-            elements.macroStatus.textContent = office.has_macros ? 'Macros Present' : 'No Macros';
-
-            if (office.detection_notes && office.detection_notes.length > 0) {
-                elements.macroDetectionNotes.innerHTML = office.detection_notes.map(note => `
-                    <div class="flex items-center space-x-2">
-                        <svg class="w-4 h-4 text-yellow-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
-                            <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" 
-                                d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z"/>
-                        </svg>
-                        <span>${note}</span>
-                    </div>
-                `).join('');
-            }
+            renderOfficeInfo(fileInfo.office_info);
        }
        else if (fileInfo.lnk_info) {
            // Show LNK-specific information section
@@ -384,6 +395,293 @@ document.addEventListener('DOMContentLoaded', function() {
            lnkInfoSection.classList.remove('hidden');
            renderLnkInfo(fileInfo.lnk_info);
        }
+        else if (fileInfo.html_smuggle_info) {
+            const htmlSection = document.getElementById('htmlSmuggleInfo');
+            if (htmlSection) htmlSection.classList.remove('hidden');
+            renderHtmlSmuggleInfo(fileInfo.html_smuggle_info);
+        }
+    }
+
+    // -- Office macro / template-injection rendering --------------------
+    //
+    // Surfaces every non-empty piece of the `office_info` structure:
+    //   * Status pill: Macros Present / No Macros
+    //   * Detection notes (one-line summaries)
+    //   * Autoexec triggers           (table: keyword + description)
+    //   * Suspicious keywords         (table: keyword + description)
+    //   * IOCs                        (table: type + value)
+    //   * External refs               (table: relationship + target -- T1221 etc.)
+    //   * Per-module VBA source code  (collapsible <details>)
+    //   * Hex / Base64 / VBA strings  (collapsible)
+    //
+    // The DOM container (#officeInfo) already exists in upload.html; this
+    // function rewrites #macroDetectionNotes (status notes) and #macroInfo
+    // (detail blocks) every time it runs.
+    function escapeHtml(s) {
+        return String(s ?? '').replace(/[&<>"']/g, c => (
+            { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }[c]
+        ));
+    }
+
+    function macroSeverityClass(office) {
+        // Treat external attachedTemplate references and live macros as the
+        // strong signals. Everything else goes "info".
+        if (office.has_macros) return 'critical';
+        if ((office.external_refs || []).some(r => r.relationship === 'attachedTemplate')) return 'critical';
+        if ((office.external_refs || []).length > 0) return 'medium';
+        return 'low';
+    }
+
+    function renderTable(headers, rows) {
+        if (!rows.length) return '';
+        const head = headers.map(h => `<th style="text-align:left;padding:4px 8px;border-bottom:1px solid var(--lb-border);font-size:11px;color:var(--lb-text-dim);text-transform:uppercase;letter-spacing:0.5px;">${escapeHtml(h)}</th>`).join('');
+        const body = rows.map(r => `<tr>${r.map(c => `<td style="padding:4px 8px;font-size:12px;vertical-align:top;border-bottom:1px solid rgba(255,255,255,0.04);">${c}</td>`).join('')}</tr>`).join('');
+        return `<table style="width:100%;border-collapse:collapse;margin:6px 0 12px 0;"><thead><tr>${head}</tr></thead><tbody>${body}</tbody></table>`;
+    }
+
+    function renderSection(title, body, opts) {
+        opts = opts || {};
+        if (!body) return '';
+        const collapsible = opts.collapsible;
+        const open = opts.open === undefined ? false : opts.open;
+        const heading = `<div style="font-size:11px;color:var(--lb-text-dim);text-transform:uppercase;letter-spacing:0.5px;margin:14px 0 4px 0;">${escapeHtml(title)}</div>`;
+        if (collapsible) {
+            return `${heading}<details ${open ? 'open' : ''} style="border:1px solid var(--lb-border);padding:8px;border-radius:3px;background:rgba(255,255,255,0.02);"><summary style="cursor:pointer;font-size:12px;color:var(--lb-text);">${escapeHtml(opts.summary || 'show')}</summary>${body}</details>`;
+        }
+        return `${heading}${body}`;
+    }
+
+    function renderOfficeInfo(office) {
+        // Status pill
+        const sev = macroSeverityClass(office);
+        const sevClassMap = {
+            critical: 'bg-red-500/8 text-red-300 border border-red-500/22',
+            medium:   'bg-yellow-500/8 text-yellow-300 border border-yellow-500/22',
+            low:      'bg-green-500/8 text-green-300 border border-green-500/22',
+        };
+        elements.macroStatus.className = `px-3 py-1 text-sm rounded-full ${sevClassMap[sev]}`;
+        elements.macroStatus.textContent = office.has_macros
+            ? 'Macros Present'
+            : ((office.external_refs || []).length > 0 ? 'External Refs' : 'No Macros');
+
+        // Top-level detection notes (one-line summaries)
+        const notes = office.detection_notes || [];
+        elements.macroDetectionNotes.innerHTML = notes.map(note => `
+            <div style="display:flex;align-items:flex-start;gap:6px;margin-bottom:3px;">
+                <span style="color:var(--lb-warn);">⚠</span>
+                <span>${escapeHtml(note)}</span>
+            </div>
+        `).join('');
+
+        // Detailed sections
+        const macroInfo = document.getElementById('macroInfo');
+        if (!macroInfo) return;
+        const parts = [];
+
+        // External references (T1221 etc.) -- shown FIRST when present
+        // since they're often the only signal for documents that have no VBA.
+        const refs = office.external_refs || [];
+        if (refs.length > 0) {
+            const rows = refs.map(r => [
+                `<span class="lb-tag ${r.relationship === 'attachedTemplate' ? 'critical' : 'medium'}">${escapeHtml(r.relationship)}</span>`,
+                `<span class="lb-mono" style="word-break:break-all;font-size:11px;"><a href="${escapeHtml(r.target)}" target="_blank" rel="noopener noreferrer" style="color:var(--lb-accent-soft);">${escapeHtml(r.target)}</a></span>`,
+                `<span class="lb-mono" style="font-size:11px;color:var(--lb-text-dim);">${escapeHtml(r.rels_file)}</span>`,
+            ]);
+            parts.push(renderSection('External References (Remote Targets)', renderTable(['Relationship', 'Target', 'In .rels'], rows)));
+        }
+
+        const a = office.analysis || {};
+
+        // Autoexec triggers
+        if ((a.autoexec || []).length > 0) {
+            const rows = a.autoexec.map(e => [
+                `<span class="lb-tag critical">${escapeHtml(e.keyword || '?')}</span>`,
+                `<span style="font-size:12px;">${escapeHtml(e.description || '')}</span>`,
+            ]);
+            parts.push(renderSection(`Auto-Execution Triggers (${a.autoexec.length})`, renderTable(['Keyword', 'Description'], rows)));
+        }
+
+        // Suspicious keywords
+        if ((a.suspicious || []).length > 0) {
+            const rows = a.suspicious.map(e => [
+                `<span class="lb-tag medium">${escapeHtml(e.keyword || '?')}</span>`,
+                `<span style="font-size:12px;">${escapeHtml(e.description || '')}</span>`,
+            ]);
+            parts.push(renderSection(`Suspicious Keywords (${a.suspicious.length})`, renderTable(['Keyword', 'Description'], rows)));
+        }
+
+        // IOCs (URLs, IPs, EXEs, etc. that olevba pulled out of the macro body)
+        if ((a.iocs || []).length > 0) {
+            const rows = a.iocs.map(ioc => [
+                `<span class="lb-tag info">${escapeHtml(ioc.type || '?')}</span>`,
+                `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${escapeHtml(ioc.value || '')}</span>`,
+            ]);
+            parts.push(renderSection(`IOCs Extracted from Macro (${a.iocs.length})`, renderTable(['Type', 'Value'], rows)));
+        }
+
+        // Hex / Base64 / VBA-encoded strings (decoded by olevba)
+        const stringSets = [
+            ['Hex Strings', a.hex_strings || []],
+            ['Base64 Strings', a.base64_strings || []],
+            ['VBA-Encoded Strings', a.vba_strings || []],
+        ];
+        for (const [label, items] of stringSets) {
+            if (items.length === 0) continue;
+            const body = items.map(e => `<div class="lb-mono" style="word-break:break-all;font-size:11px;padding:3px 0;border-bottom:1px solid rgba(255,255,255,0.04);"><strong>${escapeHtml(e.keyword || '')}:</strong> ${escapeHtml(e.description || '')}</div>`).join('');
+            parts.push(renderSection(`${label} (${items.length})`, body, { collapsible: true, summary: `${items.length} item(s) -- click to expand` }));
+        }
+
+        // Per-module VBA source code -- collapsible
+        const modules = office.modules || [];
+        if (modules.length > 0) {
+            const body = modules.map(m => `
+                <div style="margin-top:8px;">
+                    <div style="font-size:12px;color:var(--lb-text);margin-bottom:4px;">
+                        <span class="lb-mono" style="color:var(--lb-accent-soft);">${escapeHtml(m.vba_filename || '?')}</span>
+                        <span class="lb-muted" style="font-size:11px;"> -- ${escapeHtml(m.stream || '')}</span>
+                    </div>
+                    <pre style="background:rgba(0,0,0,0.3);padding:8px;border:1px solid var(--lb-border);font-size:11px;overflow-x:auto;max-height:240px;overflow-y:auto;white-space:pre-wrap;color:var(--lb-text);">${escapeHtml(m.code || '')}</pre>
+                </div>
+            `).join('');
+            parts.push(renderSection(`VBA Source (${modules.length} module${modules.length !== 1 ? 's' : ''})`, body, { collapsible: true, summary: `${modules.length} module(s) -- click to view source code` }));
+        }
+
+        macroInfo.innerHTML = parts.join('');
+    }
+
+    // -- HTML smuggling rendering --------------------------------------
+    //
+    // Surfaces every non-empty piece of the `html_smuggle_info` structure
+    // produced by app/utils/htmlsmuggle.py:
+    //   * Status pill: SMUGGLING / SUSPICIOUS / CLEAN with score
+    //   * Detection notes (one-line summaries)
+    //   * Matched-categories pill row (category × hit count)
+    //   * Matched patterns         (table: name + category + weight)
+    //   * Surface features         (table: feature + value)
+    //   * IOCs                     (download filenames, dataset blobs, largest base64 preview)
+    //
+    // Reuses the renderTable / renderSection / escapeHtml helpers defined
+    // for the office macro renderer.
+    function smuggleSeverityClass(h) {
+        if (h.is_smuggling) return 'critical';
+        if ((h.score || 0) > 0) return 'medium';
+        return 'low';
+    }
+
+    function renderHtmlSmuggleInfo(h) {
+        // Status pill
+        const sev = smuggleSeverityClass(h);
+        const sevClassMap = {
+            critical: 'bg-red-500/8 text-red-300 border border-red-500/22',
+            medium:   'bg-yellow-500/8 text-yellow-300 border border-yellow-500/22',
+            low:      'bg-green-500/8 text-green-300 border border-green-500/22',
+        };
+        if (elements.smuggleStatus) {
+            elements.smuggleStatus.className = `px-3 py-1 text-sm rounded-full ${sevClassMap[sev]}`;
+            const label = h.is_smuggling
+                ? `SMUGGLING (score ${h.score}/${h.threshold})`
+                : (h.score > 0 ? `SUSPICIOUS (score ${h.score}/${h.threshold})` : 'CLEAN');
+            elements.smuggleStatus.textContent = label;
+        }
+
+        // Detection notes
+        const notes = h.detection_notes || [];
+        if (elements.smuggleDetectionNotes) {
+            elements.smuggleDetectionNotes.innerHTML = notes.map(note => `
+                <div style="display:flex;align-items:flex-start;gap:6px;margin-bottom:3px;">
+                    <span style="color:var(--lb-warn);">⚠</span>
+                    <span>${escapeHtml(note)}</span>
+                </div>
+            `).join('');
+        }
+
+        // Detail blocks
+        const host = elements.smuggleInfo;
+        if (!host) return;
+        const parts = [];
+
+        // Score line + matched-category pills
+        const cats = h.matched_categories || {};
+        if (Object.keys(cats).length > 0) {
+            const pills = Object.entries(cats).map(([cat, count]) =>
+                `<span class="lb-tag medium" style="margin-right:4px;">${escapeHtml(cat)} × ${count}</span>`
+            ).join(' ');
+            parts.push(renderSection('Pattern Categories', `<div style="padding:4px 0;">${pills}</div>`));
+        }
+
+        // Matched patterns -- the actual signatures that fired
+        const matches = h.matched_patterns || [];
+        if (matches.length > 0) {
+            const rows = matches.map(m => [
+                `<span class="lb-mono" style="font-size:11px;">${escapeHtml(m.name)}</span>`,
+                `<span class="lb-tag info">${escapeHtml(m.category || '?')}</span>`,
+                `<span class="lb-mono" style="font-size:11px;">+${m.weight || 0}</span>`,
+            ]);
+            parts.push(renderSection(`Matched Patterns (${matches.length})`, renderTable(['Pattern', 'Category', 'Weight'], rows)));
+        }
+
+        // Surface features
+        const f = h.features || {};
+        if (Object.keys(f).length > 0) {
+            const featureRows = [
+                ['File size (bytes)', f.file_size],
+                ['Script tags', f.script_tags],
+                ['iframe tags', f.iframe_tags],
+                ['embed tags', f.embed_tags],
+                ['Base64 blob count (>=50 chars)', f.base64_blob_count],
+                ['Largest base64 blob (chars)', f.largest_base64_chars],
+                ['Has blob()', f.has_blob],
+                ['Has atob()', f.has_atob],
+                ['Has Uint8Array', f.has_uint8array],
+                ['Has URL.createObjectURL', f.has_createobjecturl],
+                ['Has <a download="...">', f.has_download_attr],
+                ['Has String.fromCharCode', f.has_fromcharcode],
+            ].filter(([, v]) => v !== undefined && v !== null && v !== false && v !== 0)
+             .map(([label, v]) => [
+                `<span style="font-size:12px;">${escapeHtml(label)}</span>`,
+                `<span class="lb-mono" style="font-size:12px;">${escapeHtml(String(v))}</span>`,
+             ]);
+            if (featureRows.length > 0) {
+                parts.push(renderSection('Surface Features', renderTable(['Feature', 'Value'], featureRows)));
+            }
+        }
+
+        // IOCs
+        const iocs = h.iocs || {};
+        const iocBits = [];
+        if ((iocs.download_filenames || []).length > 0) {
+            const rows = iocs.download_filenames.map(name => [
+                `<span class="lb-tag medium">download=</span>`,
+                `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${escapeHtml(name)}</span>`,
+            ]);
+            iocBits.push(renderTable(['Type', 'Value'], rows));
+        }
+        if ((iocs.data_file_attrs || []).length > 0) {
+            const rows = iocs.data_file_attrs.map(d => [
+                `<span class="lb-tag medium">data-file=</span>`,
+                `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${escapeHtml(d)}</span>`,
+            ]);
+            iocBits.push(renderTable(['Type', 'Value (truncated)'], rows));
+        }
+        if (iocs.largest_base64_blob && iocs.largest_base64_blob.length > 0) {
+            const b = iocs.largest_base64_blob;
+            iocBits.push(`
+                <div class="lb-mono" style="font-size:11px;padding:4px 0;">
+                    <div><strong>Largest base64 blob:</strong> ${b.length} chars</div>
+                    <div style="margin-top:4px;color:var(--lb-text-dim);">First 120: <span style="color:var(--lb-text);word-break:break-all;">${escapeHtml(b.preview_first_120)}</span></div>
+                    ${b.preview_last_120 ? `<div style="margin-top:4px;color:var(--lb-text-dim);">Last 120: <span style="color:var(--lb-text);word-break:break-all;">${escapeHtml(b.preview_last_120)}</span></div>` : ''}
+                </div>
+            `);
+        }
+        if (iocBits.length > 0) {
+            parts.push(renderSection('IOCs', iocBits.join('')));
+        }
+
+        if (h.truncated) {
+            parts.push(`<div class="lb-muted" style="font-size:11px;margin-top:8px;">⚠ Scan was truncated -- file exceeds the 5 MiB cap.</div>`);
+        }
+
+        host.innerHTML = parts.join('');
    }

    function getRuntimeConfig(buildWith) {
@@ -75,7 +75,7 @@
                <svg fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M4 6h16M4 12h16M4 18h7"/><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M14 16l3 3 5-5"/></svg>
                All
            </button>
-            <button type="button" class="lb-tab" data-mode="static" data-family="regular">
+            <button type="button" class="lb-tab" data-mode="static" data-family="regular office html">
                <svg fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"/></svg>
                Static
            </button>
@@ -363,6 +363,16 @@
                <div id="macroInfo" style="font-size: 12px; color: var(--lb-text-dim);"></div>
            </div>

+            <!-- HTML smuggling info -->
+            <div id="htmlSmuggleInfo" class="hidden" style="border: 1px solid var(--lb-border); padding: 12px; margin-bottom: 12px;">
+                <div style="display: flex; align-items: center; justify-content: space-between; margin-bottom: 8px;">
+                    <span class="lb-eyebrow">HTML Smuggling Analysis</span>
+                    <span id="smuggleStatus" class="lb-tag muted"></span>
+                </div>
+                <div id="smuggleDetectionNotes" style="font-size: 12px; color: var(--lb-text-dim); margin-bottom: 6px;"></div>
+                <div id="smuggleInfo" style="font-size: 12px; color: var(--lb-text-dim);"></div>
+            </div>
+
            <!-- File-specific info -->
            <div id="fileSpecificInfo" style="font-size: 13px; color: var(--lb-text-dim);"></div>
        </div>
@@ -8,8 +8,6 @@ Prefer importing directly from submodules in new code:
 from .file_io import (
    FileTypeDetector,
    detect_file_type,
-    get_lnk_info,
-    get_office_info,
    get_pe_info,
    save_uploaded_file,
 )
@@ -19,6 +17,9 @@ from .forensics import (
    calculate_entropy,
    get_security_analyzer,
 )
+from .htmlsmuggle import get_html_smuggle_info
+from .lnk import get_lnk_info
+from .office import get_office_info
 from .json_helpers import (
    extract_detection_counts,
    format_hex,
@@ -41,7 +42,7 @@ __all__ = [
    'allowed_file', 'calculate_entropy', 'calculate_risk', 'calculate_yara_risk',
    'check_tool', 'detect_file_type', 'extract_detection_counts',
    'find_file_by_hash', 'format_hex', 'format_size', 'generate_html_report',
-    'get_entropy_risk_level', 'get_lnk_info', 'get_office_info', 'get_pe_info',
-    'get_risk_level', 'get_security_analyzer', 'load_json_file',
-    'save_uploaded_file', 'validate_pid',
+    'get_entropy_risk_level', 'get_html_smuggle_info', 'get_lnk_info',
+    'get_office_info', 'get_pe_info', 'get_risk_level',
+    'get_security_analyzer', 'load_json_file', 'save_uploaded_file', 'validate_pid',
 ]
@@ -1,5 +1,15 @@
 # app/utils/file_io.py
-"""File ingestion: type detection, PE/Office/LNK metadata, upload handling."""
+"""File ingestion: type detection, upload handling.
+
+Per-file-type inspectors (PE / Office / LNK / HTML-smuggling) are dispatched
+from `save_uploaded_file` based on the detected family. Each inspector lives
+in its own module:
+
+  * PE                  -- get_pe_info (this module, uses forensics.SecurityAnalyzer)
+  * Office              -- utils/office.py  (get_office_info)
+  * LNK                 -- utils/lnk.py     (get_lnk_info)
+  * HTML smuggling      -- utils/htmlsmuggle.py (get_html_smuggle_info)
+"""
 import datetime
 import hashlib
 import json
@@ -11,8 +21,10 @@ import struct
 import pefile
 from werkzeug.utils import secure_filename

-from ..analyzers.static.lnk_parser import LnkForensics
 from .forensics import calculate_entropy, get_security_analyzer
+from .htmlsmuggle import get_html_smuggle_info
+from .lnk import get_lnk_info
+from .office import get_office_info
 from .risk_analyzer import RiskCalculator


@@ -42,6 +54,12 @@ class FileTypeDetector:
            elif header.startswith(cls.LNK_HEADER):
                return cls._detect_lnk_type(filepath)

+            # HTML / HTM detection -- file-extension based since HTML has no
+            # consistent magic. Cheap to check after the binary-header tests
+            # already missed.
+            if p.suffix.lower() in ('.html', '.htm'):
+                return {"family": "html", "type": p.suffix.lower().lstrip('.')}
+
            return {"family": "unknown", "type": "unknown"}

        except Exception as e:
@@ -142,8 +160,20 @@ class FileTypeDetector:
                        "visio/document.xml": "vsdx",
                    }

+                    # Flag macro-enabled OOXML by presence of vbaProject.bin --
+                    # promotes docx/xlsx/pptx -> docm/xlsm/pptm so the dashboard
+                    # Type field reflects what's actually in the container.
+                    has_vba = any(n.endswith("vbaproject.bin") for n in names)
+                    macro_enabled_map = {
+                        "docx": "docm",
+                        "xlsx": "xlsm",
+                        "pptx": "pptm",
+                    }
+
                    for path, file_type in ooxml_types.items():
                        if path in names:
+                            if has_vba and file_type in macro_enabled_map:
+                                file_type = macro_enabled_map[file_type]
                            return {"family": "office", "type": file_type}

                    return {"family": "office", "type": "ooxml-unknown"}
@@ -331,24 +361,9 @@ def _build_pe_detection_notes(is_valid_checksum, suspicious_imports,
    return detection_notes


-def get_office_info(filepath, malapi_path):
-    """Analyze Office macros (delegates to SecurityAnalyzer)."""
-    return get_security_analyzer(malapi_path).analyze_office_macros(filepath)
-
-
-def get_lnk_info(filepath):
-    """Analyze a Windows .LNK shortcut for forensic data."""
-    try:
-        lnk = LnkForensics(filepath)
-        if not lnk.is_valid():
-            return {'lnk_info': None}
-
-        forensic_data = lnk.get_forensic_data()
-        return {'lnk_info': forensic_data}
-
-    except Exception as e:
-        print(f"Error analyzing LNK file: {e}")
-        return {'lnk_info': None}
+# Office / LNK / HTML-smuggling inspectors live in their own modules
+# (imported at the top of this file). PE inspection stays here because it's
+# tightly coupled to the SecurityAnalyzer cache (MalAPI lookup + entropy).


 def _build_entropy_analysis(entropy_value):
@@ -456,6 +471,11 @@ def save_uploaded_file(file, config):
        if 'error' not in lnk_result:
            file_info.update(lnk_result)

+    elif file_type_info['family'] == 'html':
+        # Always update -- get_html_smuggle_info returns a usable dict even
+        # for clean files (just with is_smuggling=false / score=0).
+        file_info.update(get_html_smuggle_info(filepath))
+
    with open(os.path.join(result_folder, filename, 'file_info.json'), 'w') as f:
        json.dump(file_info, f)

@@ -1,11 +1,15 @@
 # app/utils/forensics.py
-"""PE/Office forensic analysis: entropy, runtime detection, MalAPI lookup."""
+"""PE forensic analysis: entropy, runtime detection, MalAPI lookup.
+
+Office / LNK / HTML-smuggling analyzers live in their own modules
+(`utils/office.py`, `utils/lnk.py`, `utils/htmlsmuggle.py`) so each file-type
+inspector is self-contained and easy to maintain. This module is now strictly
+PE-focused.
+"""
 import json
 import math
 from collections import Counter

-from oletools.olevba import VBA_Parser
-

 # Known runtime imports for compiled languages — used to flag PE imports as
 # benign-runtime rather than suspicious.
@@ -251,46 +255,6 @@ class SecurityAnalyzer:

        return sections_info

-    def analyze_office_macros(self, filepath):
-        """Inspect Office VBA macros for suspicious patterns."""
-        try:
-            vbaparser = VBA_Parser(filepath)
-            detection_notes = []
-
-            info = {
-                'file_type': 'Microsoft Office Document',
-                'has_macros': vbaparser.detect_vba_macros(),
-                'macro_info': None,
-                'detection_notes': detection_notes,
-            }
-
-            if vbaparser.detect_vba_macros():
-                macro_analysis = vbaparser.analyze_macros()
-                info['macro_info'] = macro_analysis
-
-                macro_text = str(macro_analysis).lower()
-                detection_patterns = {
-                    'shell': 'Shell command execution detected',
-                    'wscript': 'WScript execution detected',
-                    'powershell': 'PowerShell execution detected',
-                    'http': 'Network communication detected',
-                    'auto': 'Auto-execution mechanism detected',
-                    'document_open': 'Document open auto-execution',
-                    'windowshide': 'Hidden window execution',
-                    'createobject': 'COM object creation detected',
-                }
-
-                for pattern, note in detection_patterns.items():
-                    if pattern in macro_text:
-                        detection_notes.append(note)
-
-            vbaparser.close()
-            return {'office_info': info}
-        except Exception as e:
-            print(f"Error analyzing Office file: {e}")
-            return {'office_info': None}
-
-
 _security_analyzer_cache = {}


@@ -0,0 +1,282 @@
+# app/utils/htmlsmuggle.py
+"""HTML-smuggling pattern scanner.
+
+Runs at upload time on `.html` / `.htm` files (alongside `get_pe_info`,
+`get_office_info`, `get_lnk_info`). Output lands in `file_info.html_smuggle_info`
+and is rendered on the upload-result page the same way office_info is.
+
+Pattern set + scoring model ported from SmuggleShield's `content.js`
+(https://github.com/RootUp/SmuggleShield). The browser extension catches
+runtime behaviour (DOM mutation, blob URL revoke, programmatic <a download>
+click); we catch the file-on-disk equivalent by regex-scanning the raw
+HTML source.
+
+Scoring (mirrors SmuggleShield):
+  - Each pattern carries a weight (2-4).
+  - High-weight (>=3) patterns scanned first; early-return when the score
+    crosses the threshold.
+  - Low-weight (<3) patterns are scanned only when the high-weight pass
+    ended below the threshold but at or above `threshold - 2`.
+  - A cheap pre-filter (`atob | blob | base64 | createobjecturl | ...`)
+    skips files that obviously aren't smuggling.
+"""
+
+import os
+import re
+from typing import Dict, List
+
+
+# Each entry is (weight, regex source, name, category). The sources are
+# compiled further down with re.IGNORECASE | re.DOTALL.
+_PATTERNS = [
+    # --- Direct base64 -> binary -> blob path ----------------------------
+    (3, r'atob\s*\([^)]+\).*new\s+uint8array',                                                              'atob_to_uint8array',           'encoding'),
+    (3, r"atob\s*\(\s*['\"]([A-Za-z0-9+/=]{100,})['\"].*\)",                                                'large_base64_atob',            'encoding'),
+    (3, r'new\s+blob\s*\(\s*\[\s*(?:data|atob\s*\()',                                                       'blob_from_atob_data',          'blob'),
+    (4, r"let\s+arrayBuffer\s*=\s*\['0x[0-9a-f]{2}'(?:\s*,\s*'0x[0-9a-f]{2}')+\]",                          'hex_array_buffer',             'encoding'),
+
+    # --- Reversed-string fromCharCode obfuscation ------------------------
+    (4, r'\["edoCrahCmorf"(?:\s*\[\s*"split"\s*\]\s*\(\s*""\s*\)\s*\[\s*"reverse"\s*\]\s*\(\s*\)\s*\[\s*"join"\s*\]\s*\(\s*""\s*\))', 'reversed_fromcharcode_obf', 'obfuscation'),
+    (4, r'setTimeout\s*\(\s*\[.*?\]\.map\s*\(\s*.*?=>.*?(?:fromCharCode|edoCrahCmorf).*?\/\s*\d+\s*\)',     'settimeout_fromcharcode',      'obfuscation'),
+    (3, r'String\s*\[\s*(?:"edoCrahCmorf"|[\'"][^\'\"]+[\'"]\.split\([\'"][\'"]\)\.reverse\(\)\.join\([\'"][\'"]\))\s*\]', 'string_reverse_index', 'obfuscation'),
+
+    # --- Blob -> object URL -> download chain ----------------------------
+    (3, r'url\.createobjecturl\s*\(\s*(?:my)?blob\s*\)',                                                    'createobjecturl_from_blob',    'blob'),
+    (3, r'location(?:\s*\[\s*[\'"]href[\'"]\s*\])?\s*=\s*url',                                              'location_href_assign',         'writer'),
+    (2, r'url\.revokeobjecturl\s*\(\s*url\s*\)',                                                            'revokeobjecturl',              'blob'),
+    (3, r'\.style\s*=\s*[\'"]display:\s*none[\'"].*\.href\s*=.*\.download\s*=',                             'hidden_anchor_download',       'writer'),
+    (3, r'\.click\s*\(\s*\).*url\.revokeobjecturl',                                                         'auto_click_then_revoke',       'writer'),
+    (3, r'href\s*=\s*["\']data:(?:application/octet-stream|image/svg\+xml);base64,',                        'data_url_octet_stream',        'writer'),
+
+    # --- Bracket-string property access (window["a"+"to"+"b"] etc.) ------
+    (3, r'window\s*\[\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\]',                                      'window_bracket_concat',        'obfuscation'),
+    (4, r'document\s*\[\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\]\s*\(\s*window\s*\[\s*(?:[\'"]at[\'"].*[\'"]o[\'"].*[\'"]b[\'"]\s*\]|\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\])\s*\([\'"][A-Za-z0-9+/=]+[\'"]\)\s*\)', 'document_bracket_atob', 'obfuscation'),
+    (4, r'var\s+\w+=\w+;?\s*\(function\(\w+,\w+\)\{.*while\(!!\[\]\)\{try\{.*parseint.*\}catch\(\w+\)\{.*\}\}\(.*\)\);?', 'parseint_obfuscator',    'obfuscation'),
+
+    # --- Blob mime-type signatures + writer chain ------------------------
+    (3, r'blob\s*\(\s*\[[^\]]+\]\s*,\s*\{\s*type\s*:\s*[\'"](?:application/octet-stream|text/html|octet/stream)[\'"](?:\s*,\s*encoding\s*:\s*[\'"]base64[\'"])?\s*\}\s*\)', 'blob_with_octet_type', 'blob'),
+
+    # --- WebAssembly / Go runtime smuggling ------------------------------
+    (3, r'webassembly\s*\.\s*(?:instantiate(?:streaming)?|instance)',                                       'webassembly_instantiate',      'wasm'),
+    (2, r'navigator\.serviceworker\.register',                                                              'service_worker_register',      'wasm'),
+    (2, r'wasm[_-]?exec\.js',                                                                               'wasm_exec_js',                 'wasm'),
+    (3, r'\.wasm\b',                                                                                        'wasm_extension_ref',           'wasm'),
+    (3, r'new\s+go\s*\(\s*\)',                                                                              'go_runtime_new',               'wasm'),
+    (3, r'go\s*\.\s*run\s*\(',                                                                              'go_runtime_run',               'wasm'),
+
+    # --- Embedded srcdoc / iframe + script -------------------------------
+    (3, r'srcdoc\s*=\s*["\'][^"\']*<script',                                                                'srcdoc_with_script',           'writer'),
+    (3, r'<embed[^>]*base64',                                                                               'embed_with_base64',            'writer'),
+
+    # --- Decoder helpers + legacy IE save -------------------------------
+    (3, r'function\s+(?:b64toarray|xor|base64toarraybuffer)\s*\([^)]*\)\s*\{[\s\S]*?return\s+(?:bytes\.buffer|result);?\}', 'decoder_helper_func', 'encoding'),
+    (3, r'document\.createelement\([\'"]embed[\'"]\)',                                                      'createelement_embed',          'writer'),
+    (2, r'\.setattribute\([\'"]src[\'"]\s*,\s*.*\)',                                                        'setattribute_src',             'writer'),
+    (3, r'window\.navigator\.mssaveoropenblob\s*\(\s*blob\s*,\s*filename\s*\)',                             'mssaveoropenblob',             'writer'),
+    (2, r'(?:window\.)?url\.createobjecturl\s*\(\s*(?:blob|[^)]+)\s*\)',                                    'generic_createobjecturl',      'blob'),
+    (2, r'(?:a|element)\.download\s*=\s*(?:filename|[\'"][^\'"]+[\'"])',                                    'anchor_download_attr',         'writer'),
+    (2, r'string\.fromcharcode\(.*\)',                                                                      'string_fromcharcode',          'encoding'),
+    (2, r'\.charcodeat\(.*\)',                                                                              'charcodeat',                   'encoding'),
+    (3, r'document\.getelementbyid\([\'"]passwordid[\'"]\)\.value',                                         'password_field_lookup',        'writer'),
+    (3, r'import\s*\(\s*url\.createobjecturl\s*\(',                                                         'dynamic_import_objurl',        'wasm'),
+    (3, r'\w+\s*\(\s*\w+\s*\(\s*[\'"][A-Za-z0-9+/=]{50,}[\'"]\s*\)\s*\)',                                   'nested_call_long_b64',         'encoding'),
+    (2, r'(?:window\.)?atob\s*\(',                                                                          'atob_call',                    'encoding'),
+    (2, r'uint8[aA]rray\s*\(\s*(?:(?!len)[^)])*\)',                                                         'uint8array_constructor',       'encoding'),
+    (3, r'mssaveoropenblob|mssaveblob',                                                                     'mssave_alias',                 'writer'),
+    (3, r'base64toarraybuffer',                                                                             'b64_to_arraybuffer_helper',    'encoding'),
+    (3, r'xmlhttprequest\(\).*\.responsetype\s*=\s*[\'"]arraybuffer[\'"]',                                  'xhr_arraybuffer_response',     'encoding'),
+    (3, r'new\s+dataview\(.*\).*\.getuint8\(.*\).*\.setuint8\(',                                            'dataview_getset_uint8',        'encoding'),
+    (2, r'[^\w](\w+)\s*=\s*(\w+)\s*\^\s*(\w+)',                                                             'xor_operation',                'encoding'),
+    (2, r'\.slice\(\s*\w+\s*-\s*\d+\s*,\s*\w+\s*-\s*\d+\s*\)',                                              'string_slice_offset',          'obfuscation'),
+    (3, r'for\s*\([^)]+\)\s*\{[^}]*string\.fromcharcode\([^)]+\)',                                          'loop_fromcharcode',            'encoding'),
+
+    # --- GWT (Google Web Toolkit) smuggling artefacts --------------------
+    (4, r'\$wnd\s*=\s*window;\s*\$doc\s*=\s*\$wnd\.document',                                               'gwt_wnd_doc',                  'gwt'),
+    (4, r'__gwt_(?:isKnownPropertyValue|getMetaProperty|marker|stylesLoaded|scriptsLoaded)',                'gwt_internals',                'gwt'),
+    (3, r'\$strongName\s*=\s*[\'"][0-9A-F]{32}[\'"]',                                                       'gwt_strong_name',              'gwt'),
+    (3, r'\$gwt_version\s*=\s*[\'"][0-9.]+[\'"]',                                                           'gwt_version',                  'gwt'),
+    (4, r'(?:function|var)\s+[a-zA-Z$_]+\s*=\s*\{\s*[a-zA-Z$_]+:\s*window,\s*[a-zA-Z$_]+:\s*document\s*\}', 'gwt_window_doc_pair',          'gwt'),
+    (3, r'\b(?:gwtOnLoad|__gwtStatsEvent|gwtOnLoadFunc)\b',                                                 'gwt_onload',                   'gwt'),
+    (3, r'\.setAttribute\([\'"]__gwt_property[\'"]',                                                        'gwt_property_attr',            'gwt'),
+    (4, r'document\.createElement\([\'"]script[\'"]\).*?\.src\s*=.*?\.cache\.js',                           'gwt_cache_js',                 'gwt'),
+
+    # --- Mouse/event-triggered drop chains -------------------------------
+    (4, r'(?:document|window)\.on(?:mousemove|load|mouseover)\s*=\s*function\s*\(\s*\)\s*\{[^}]*?data:application/[^}]*?\.click\(\)[^}]*?(?:removeChild|remove)\(', 'mouse_event_drop', 'writer'),
+    (4, r'(?:window|var|let)\.\w+Triggered\s*=\s*(?:true|false).*?(?:navigator|platform).*?data:application/[^;]+;base64,.*?\.(?:download|click)', 'triggered_flag_drop', 'writer'),
+    (4, r'navigator\[?["\']platform["\']\]?.*?(?:document|window)\.on\w+.*?data:application/',              'platform_event_drop',          'writer'),
+
+    # --- Generic split/concat/reverse obfuscation ------------------------
+    (3, r'\[[\'"][^\'\"]+[\'"]\s*\+\s*[\'"][^\'\"]+[\'"]\]',                                                'string_concat_index',          'obfuscation'),
+    (3, r"\[\'[a-z]+\'\s*\+\s*\'[a-z]+\'\]",                                                                'concat_lower_index',           'obfuscation'),
+    (3, r"\[\s*(?:[\'\"]\w?[\'\"](?:\s*,\s*)?){4,}\s*\]\.join\s*\(\s*[\'\"]*\s*\)",                         'array_join_join',              'obfuscation'),
+    (3, r'const\s+\w+\s*=\s*\[\s*(?:[\'"]\w?[\'"](?:\s*,\s*)?){4,}',                                        'const_char_array',             'obfuscation'),
+    (4, r'(\[(?:\][^(]*|\[\])[^(]*|\w+\.)constructor\s*\(\s*([\'"])return\s*\w+\2\s*\)',                    'constructor_return',           'obfuscation'),
+    (4, r'Function\s*\(\s*[\'"]return\s+\w+[\'"](?:\s*\)\s*\(\s*\)|\(\))',                                  'function_return',              'obfuscation'),
+    (3, r'\w+\.split\s*\(\s*[\'"][\'\"]?\s*\)\.reverse\s*\(\s*\)\.join\s*\(',                               'split_reverse_join',           'obfuscation'),
+    (3, r'\[\s*\w+\.split\s*\(\s*[\'"][\'"]\s*\)\.reverse\s*\(\s*\)',                                       'array_split_reverse',          'obfuscation'),
+    (3, r'setTimeout\s*\(\s*(?:function|\(\)|[^,]+)\s*(?:=>)?\s*\{[\s\S]{10,}?setTimeout\s*\(',             'nested_settimeout',            'obfuscation'),
+    (4, r'setTimeout\s*\([^{)]*\{[^{}]*setTimeout\s*\([^{)]*\{[^{}]*\}',                                    'double_settimeout',            'obfuscation'),
+    (4, r'new\s*\([^)]*\[\s*(?:[\'"][^\'\"]+[\'"]\.split|[\'"]\w+[\'"]\.split)',                            'new_with_split_index',         'obfuscation'),
+    (3, r'\[[^\]]*(?:join|reverse)[^\]]*\]\s*\(\s*(?:\w+|[\'"][^\'"]*[\'"])\s*\)',                          'index_join_reverse',           'obfuscation'),
+    (3, r'\[\s*(?:urlMethod|parts\.join\(\)|[\'"]\w+[\'"]\s*\+)',                                           'partsjoin_index',              'obfuscation'),
+    (4, r'\w+\s*\[\s*(?:[\'"][^\'\"]+[\'"](?:\s*\+\s*)?)+\s*\]\s*\(\s*\w+\s*\)',                            'concat_call',                  'obfuscation'),
+
+    # --- "down" + "load" decomposition (extremely common) ----------------
+    (3, r'[\'"]?down[\'"]?\s*\+\s*[\'"]?load[\'"]?',                                                        'down_plus_load',               'obfuscation'),
+    (4, r"\['down' \+ 'load'\]",                                                                            'down_load_bracket_exact',      'obfuscation'),
+    (4, r'createElement\s*\(\s*[\'"]a[\'"]\s*\)[^}]*?\[\s*[\'"]\w+[\'"]\s*\+\s*[\'"]\w+[\'"]\s*\]',         'createanchor_concat_attr',     'writer'),
+    (3, r"\['style'\]\['visi' \+ 'bility'\]",                                                               'visibility_concat',            'obfuscation'),
+
+    # --- Chunked-substr + dataset-based payload chains -------------------
+    (3, r'function\s+\w+Chunks\s*\([^)]*\)\s*\{[^{}]*for\s*\([^{}]*\)\s*\{[^{}]*substr',                    'chunk_substr_loop',            'encoding'),
+    (2, r'\.substr\s*\(\s*\w+\s*,\s*\w+Size\s*\)',                                                          'substr_size_param',            'encoding'),
+    (4, r'\(async\s*\(\s*\)\s*=>\s*\{\s*(?:let|var|const)\s+d\s*=.*?(?:document\.getElementById|document\.querySelector).*?dataset.*?\.href\s*=\s*d.*?\.download\s*=.*?\.click\s*\(\s*\)', 'async_dataset_click', 'writer'),
+    (4, r'\bdocument\.getElementById\s*\(\s*[\'"]data[\'"]\s*\).*?\.dataset\.file.*?createElement\s*\(\s*[\'"]a[\'"]\s*\).*?\.download\s*=', 'data_div_dataset_anchor', 'writer'),
+    (3, r'<div[^>]*id\s*=\s*["\']data["\'][^>]*data-file\s*=\s*["\'][A-Za-z0-9+/=]{50,}["\'][^>]*>',        'data_div_with_b64',            'writer'),
+    (4, r'<script>\s*\(\s*async\s*\(\s*\)\s*=>\s*\{[^}]*createElement\s*\(\s*[\'"]a[\'"]\s*\)[^}]*\.click\s*\(\s*\)[^}]*\.remove\s*\(\s*\)', 'inline_async_click_remove', 'writer'),
+    (4, r'\b(?:atob|decodeURIComponent)\s*\([^)]*(?:dataset|getAttribute)\s*\.[^)]*\)[^;]*\.href\s*=[^;]*\.download\s*=[^;]*\.click\s*\(\s*\)', 'decode_dataset_click', 'writer'),
+    (4, r'\bdocument\.body\.appendChild\s*\([^)]+\)[^;]*\.click\s*\(\s*\)[^;]*\.remove\s*\(\s*\)',          'append_click_remove',          'writer'),
+]
+
+# Quick-reject filter -- skip the full regex pass on obviously-clean HTML.
+# NOTE(review): none of these keywords occur in the wasm / GWT signatures in
+# _PATTERNS, so a page that only trips those never reaches the regex pass --
+# confirm that is intended.
+_QUICK_CHECK = re.compile(
+    r'blob|atob|download|base64|arraybuffer|uint8array|createobjecturl|fromcharcode',
+    re.IGNORECASE,
+)
+# Minimum summed pattern weight for a file to be reported as smuggling.
+_THRESHOLD = 4
+_MAX_BYTES = 5 * 1024 * 1024   # 5 MiB cap on what we read for the scan
+
+# Pre-compile patterns once at import time.
+# DOTALL lets `.` span newlines, so multi-line script chains still match.
+_RE_FLAGS = re.IGNORECASE | re.DOTALL
+_COMPILED = [(w, re.compile(p, _RE_FLAGS), n, c) for w, p, n, c in _PATTERNS]
+# Split by weight: _HIGH is scanned first, _LOW only as a follow-up pass.
+_HIGH = [t for t in _COMPILED if t[0] >= 3]
+_LOW  = [t for t in _COMPILED if t[0] < 3]
+
+
+def get_html_smuggle_info(filepath: str) -> Dict:
+    """Scan an HTML file on disk for smuggling signatures.
+
+    Reads at most `_MAX_BYTES` of the file, decodes it as UTF-8 (undecodable
+    bytes are replaced) and returns a dict with a single `html_smuggle_info`
+    key, ready for `file_info.update(result)`.
+
+    Returns:
+        `{'html_smuggle_info': <report>}` where the report is the dict built
+        by `_build`, or `{'html_smuggle_info': {'error': 'read failed: ...'}}`
+        when the file cannot be stat'ed / opened / read.
+    """
+    try:
+        on_disk = os.path.getsize(filepath)
+        with open(filepath, 'rb') as handle:
+            head = handle.read(_MAX_BYTES)
+    except OSError as e:
+        return {'html_smuggle_info': {'error': f'read failed: {e}'}}
+
+    content = head.decode('utf-8', errors='replace')
+    # True when the read cap cut the file short.
+    truncated = on_disk > len(head)
+
+    # Features and IOCs are collected even for files that skip the regex pass.
+    features = _features(content)
+    iocs = _iocs(content)
+
+    score = 0
+    matches: List[Dict] = []
+    if _QUICK_CHECK.search(content):
+        score, matches = _scan(content, _HIGH, _THRESHOLD)
+        # Only a near miss on the high-weight pass earns the low-weight pass.
+        near_miss_floor = max(0, _THRESHOLD - 2)
+        if near_miss_floor <= score < _THRESHOLD:
+            low_score, low_matches = _scan(content, _LOW, _THRESHOLD - score)
+            score += low_score
+            matches += low_matches
+
+    verdict = score >= _THRESHOLD
+    return {'html_smuggle_info': _build(verdict, score, matches, features, iocs, truncated)}
+
+
+def _scan(content: str, patterns, max_score: int):
+    """Run `patterns` over `content` in order and total the weights that hit.
+
+    `patterns` is an iterable of `(weight, compiled_regex, name, category)`.
+    Scanning stops right after the hit that brings the running total to
+    `max_score` or beyond. Returns `(total, hits)` where each hit is a
+    `{'name', 'category', 'weight'}` dict.
+    """
+    total = 0
+    hits: List[Dict] = []
+    for entry in patterns:
+        weight, regex, name, category = entry
+        if regex.search(content) is None:
+            continue
+        total += weight
+        hits.append(dict(name=name, category=category, weight=weight))
+        # The cut-off is only evaluated after a hit, never before the first.
+        if total >= max_score:
+            break
+    return total, hits
+
+
+def _features(content: str) -> Dict:
+    """Collect cheap surface counts / flags from the decoded HTML text.
+
+    Returns a dict of booleans (`has_*`), tag counts, and statistics about
+    runs of 50+ base64-alphabet characters. `file_size` is the length of
+    `content` as passed in (characters), not the on-disk byte size.
+    """
+    def present(pattern: str) -> bool:
+        return bool(re.search(pattern, content, re.IGNORECASE))
+
+    def tag_count(pattern: str) -> int:
+        return len(re.findall(pattern, content, re.IGNORECASE))
+
+    # Length of every long base64-looking run in the document.
+    run_lengths = [
+        hit.end() - hit.start()
+        for hit in re.finditer(r'[A-Za-z0-9+/=]{50,}', content)
+    ]
+
+    report: Dict = {'file_size': len(content)}
+    report['has_blob'] = present(r'\bblob\s*\(')
+    report['has_atob'] = present(r'\batob\s*\(')
+    report['has_uint8array'] = present(r'\buint8array\b')
+    report['has_createobjecturl'] = present(r'createobjecturl')
+    report['has_download_attr'] = present(r'\bdownload\s*=\s*[\'"][^\'"]+[\'"]')
+    report['has_fromcharcode'] = present(r'fromcharcode')
+    report['script_tags'] = tag_count(r'<script\b')
+    report['iframe_tags'] = tag_count(r'<iframe\b')
+    report['embed_tags'] = tag_count(r'<embed\b')
+    report['base64_blob_count'] = len(run_lengths)
+    report['largest_base64_chars'] = max(run_lengths, default=0)
+    return report
+
+
+def _iocs(content: str) -> Dict:
+    """Pull operator-readable artifacts out of the decoded HTML text.
+
+    Returns a dict with:
+      - `download_filenames`: up to 20 distinct `download="..."` values
+        (1-100 chars each), in first-seen order.
+      - `data_file_attrs`: up to 10 distinct `data-file="..."` base64 values,
+        each clipped to 200 chars, in first-seen order.
+      - `largest_base64_blob`: length + head/tail previews of the longest
+        200+ char base64-alphabet run found, or None when there is none.
+
+    Deduplication uses `dict.fromkeys` rather than a set so that both the
+    order and the subset kept under the caps are the same on every run.
+    """
+    download_names = list(dict.fromkeys(
+        m.group(1)
+        for m in re.finditer(r'\bdownload\s*=\s*[\'"]([^\'"]{1,100})[\'"]', content, re.IGNORECASE)
+    ))[:20]
+
+    largest_b64 = ''
+    for m in re.finditer(r'[A-Za-z0-9+/=]{200,}', content):
+        blob = m.group(0)
+        if len(blob) > len(largest_b64):
+            largest_b64 = blob
+            # Stop early once a run over 50000 chars is found; later runs
+            # are not examined, so "largest" is best-effort beyond this size.
+            if len(largest_b64) > 50000:
+                break
+
+    data_file_attrs = list(dict.fromkeys(
+        m.group(1)[:200]
+        for m in re.finditer(r'\bdata-file\s*=\s*[\'"]([A-Za-z0-9+/=]{20,})[\'"]', content, re.IGNORECASE)
+    ))[:10]
+
+    return {
+        'download_filenames': download_names,
+        'data_file_attrs': data_file_attrs,
+        'largest_base64_blob': {
+            'length': len(largest_b64),
+            'preview_first_120': largest_b64[:120],
+            # Tail preview is only filled in when it adds something beyond the head.
+            'preview_last_120': largest_b64[-120:] if len(largest_b64) > 120 else '',
+        } if largest_b64 else None,
+    }
+
+
+def _build(is_smuggling: bool, score: int, matches, features, iocs, truncated: bool) -> Dict:
+    """Assemble the final report dict from the scan results.
+
+    Tallies matches per category, derives human-readable `detection_notes`
+    from the verdict, score and feature flags, and returns everything under
+    a fixed set of keys.
+    """
+    category_counts: Dict[str, int] = {}
+    for hit in matches:
+        category_counts.setdefault(hit['category'], 0)
+        category_counts[hit['category']] += 1
+
+    notes: List[str] = []
+
+    # Verdict line: either a detection, or a below-threshold warning.
+    if is_smuggling:
+        fired = len(matches)
+        plural = '' if fired == 1 else 's'
+        notes.append(
+            f"HTML smuggling detected -- pattern score {score} >= threshold {_THRESHOLD} "
+            f"({fired} pattern{plural} fired)"
+        )
+    elif score > 0:
+        notes.append(f"Suspicious patterns present but below threshold ({score}/{_THRESHOLD})")
+
+    # Feature-driven notes are independent of the pattern score.
+    largest = features.get('largest_base64_chars', 0)
+    if largest >= 1000:
+        notes.append(
+            f"Large base64 blob present ({largest} chars) "
+            f"-- typical of smuggled binary payload"
+        )
+    if features.get('has_download_attr') and features.get('has_blob'):
+        notes.append("Combination of <a download> + Blob -- classic smuggling-writer chain")
+    if features.get('has_atob') and features.get('has_uint8array'):
+        notes.append("atob() + Uint8Array decode chain present")
+
+    report: Dict = {}
+    report['is_smuggling'] = is_smuggling
+    report['score'] = score
+    report['threshold'] = _THRESHOLD
+    report['matched_patterns'] = matches
+    report['matched_categories'] = category_counts
+    report['features'] = features
+    report['iocs'] = iocs
+    report['truncated'] = truncated
+    report['detection_notes'] = notes
+    return report
@@ -0,0 +1,29 @@
+# app/utils/lnk.py
+"""Windows shortcut (.lnk) analyzer.
+
+Runs at upload time on .lnk files (alongside `get_pe_info`, `get_office_info`,
+`get_html_smuggle_info`). Output lands in `file_info.lnk_info`.
+
+Heavy lifting is in `app.analyzers.static.lnk_parser.LnkForensics`; this
+module is a thin wrapper that adapts the parser to the file_io drop-in
+contract (returns `{lnk_info: {...}}` ready for `file_info.update(...)`).
+"""
+
+import logging
+from typing import Dict
+
+from ..analyzers.static.lnk_parser import LnkForensics
+
+logger = logging.getLogger(__name__)
+
+
+def get_lnk_info(filepath: str) -> Dict:
+    """Parse a .lnk file and wrap the result for `file_info.update(...)`.
+
+    Returns `{'lnk_info': <forensic data>}` when the parser reports the file
+    as valid, and `{'lnk_info': None}` when it does not or when parsing
+    raises (the failure is logged as a warning).
+    """
+    forensic_data = None
+    try:
+        parser = LnkForensics(filepath)
+        if parser.is_valid():
+            forensic_data = parser.get_forensic_data()
+    except Exception as e:
+        logger.warning(f"LNK analysis failed on {filepath}: {e}")
+    return {'lnk_info': forensic_data}
@@ -0,0 +1,220 @@
+# app/utils/office.py
+"""Office document analyzer.
+
+Runs at upload time on Word / Excel / RTF / legacy CFBF binaries (alongside
+`get_pe_info`, `get_lnk_info`, `get_html_smuggle_info`). Output lands in
+`file_info.office_info`.
+
+Two analysis branches:
+
+  1. olevba -- VBA / XLM macros embedded in the file. Pulls per-module
+     source, autoexec triggers, suspicious keyword hits, IOCs.
+
+  2. OOXML rels inspection -- external `attachedTemplate` / `oleObject` /
+     `subDocument` / `frame` references. Catches T1221 (Remote Template
+     Injection) which is invisible to olevba because the malicious VBA
+     lives in a remote .dotm, not in the file itself. Atomic Red Team's
+     `Calculator.docx` is the canonical example.
+"""
+
+import logging
+import xml.etree.ElementTree as ET
+import zipfile
+from typing import Dict, List
+
+from oletools.olevba import VBA_Parser
+
+logger = logging.getLogger(__name__)
+
+
+# Relationship Types we care about when they target an external (HTTP/UNC)
+# resource. `attachedTemplate` is the T1221 vector. The others pull remote
+# content the same way; less common but the same class of risk.
+# Compared against the last `/`-separated segment of a relationship's `Type`
+# attribute. Only attachedTemplate / oleObject / subDocument produce a
+# detection note; the remaining kinds are just listed in `external_refs`.
+_INTERESTING_RELS = (
+    'attachedTemplate',
+    'oleObject',
+    'subDocument',
+    'frame',
+    'image',          # rare but seen in malicious docs that fetch tracking pixels
+    'hyperlink',
+)
+
+
+def get_office_info(filepath: str, malapi_path=None) -> Dict:
+    """Analyze an Office document and return `{'office_info': {...}}`.
+
+    Builds an empty report skeleton, then lets each analysis branch fill it
+    in place: the olevba macro pass first, the external-relationship pass
+    second. The result is always a dict, so callers can
+    `file_info.update(result)` unconditionally.
+
+    `malapi_path` is kept only so older call sites that still pass it keep
+    working; it is ignored.
+    """
+    analysis = {
+        'autoexec': [],         # [{keyword, description}] auto-execution triggers
+        'suspicious': [],       # [{keyword, description}] suspicious keyword hits
+        'iocs': [],             # [{type, value}] extracted URLs / IPs / EXEs / etc.
+        'hex_strings': [],
+        'base64_strings': [],
+        'vba_strings': [],
+    }
+    info = {
+        'file_type': 'Microsoft Office Document',
+        'has_macros': False,
+        'modules': [],          # [{stream, vba_filename, code}]
+        'analysis': analysis,
+        'external_refs': [],    # external relationship targets (T1221 etc.)
+        'detection_notes': [],
+    }
+
+    # Each branch mutates `info` and swallows its own failures.
+    for branch in (_run_olevba, _run_external_rels):
+        branch(filepath, info)
+
+    return {'office_info': info}
+
+
+def _run_olevba(filepath: str, info: Dict) -> None:
+    """Branch 1 -- VBA / XLM macro analysis via oletools.olevba.
+
+    Mutates `info` in place: sets `has_macros`, appends per-module source to
+    `modules`, buckets olevba's analysis rows into `info['analysis']`, and
+    adds summary lines to `detection_notes`. Any olevba failure is logged and
+    leaves whatever was collected so far; nothing is raised to the caller.
+    """
+    try:
+        vbaparser = VBA_Parser(filepath)
+    except Exception as e:
+        logger.warning(f"olevba init failed on {filepath}: {e}")
+        return
+
+    try:
+        if not vbaparser.detect_vba_macros():
+            return
+
+        info['has_macros'] = True
+
+        # Per-module source code: (filename, stream_path, vba_filename, vba_code)
+        for _, stream, vba_fname, vba_code in vbaparser.extract_macros():
+            if vba_code:
+                info['modules'].append({
+                    'stream': stream,
+                    'vba_filename': vba_fname,
+                    'code': vba_code,
+                })
+
+        a = info['analysis']
+        # Lower-cased olevba row type -> destination list for the rows that
+        # are stored as {keyword, description}.
+        buckets = {
+            'autoexec': a['autoexec'],
+            'suspicious': a['suspicious'],
+            'hex string': a['hex_strings'],
+            'base64 string': a['base64_strings'],
+            'vba string': a['vba_strings'],
+            'vba_string': a['vba_strings'],
+        }
+
+        # Structured analysis -- olevba returns (kw_type, keyword, description)
+        for kw_type, keyword, description in vbaparser.analyze_macros():
+            kt = (kw_type or '').lower()
+            if kt in ('ioc', 'iocs'):
+                # olevba tags these rows 'IOC' and emits them as
+                # (type, value, pattern_type): the extracted value sits in
+                # the keyword slot and the kind (URL, IPv4, ...) in the
+                # description slot.
+                a['iocs'].append({'type': description, 'value': keyword})
+            elif kt in buckets:
+                buckets[kt].append({'keyword': keyword, 'description': description})
+
+        if a['autoexec']:
+            info['detection_notes'].append(
+                f"{len(a['autoexec'])} auto-execution trigger"
+                f"{'s' if len(a['autoexec']) != 1 else ''} detected"
+            )
+        if a['suspicious']:
+            info['detection_notes'].append(
+                f"{len(a['suspicious'])} suspicious keyword"
+                f"{'s' if len(a['suspicious']) != 1 else ''} in macro body"
+            )
+        if a['iocs']:
+            info['detection_notes'].append(
+                f"{len(a['iocs'])} IOC"
+                f"{'s' if len(a['iocs']) != 1 else ''} extracted from macro"
+            )
+    except Exception as e:
+        logger.warning(f"olevba analysis failed on {filepath}: {e}")
+    finally:
+        # Best-effort close; a failure here must not mask the analysis result.
+        try:
+            vbaparser.close()
+        except Exception:
+            pass
+
+
+def _run_external_rels(filepath: str, info: Dict) -> None:
+    """Branch 2 -- T1221 / external-relationship inspection.
+
+    Stores every external relationship found under `info['external_refs']`
+    and appends one detection note per relationship kind that is present
+    (attachedTemplate, oleObject, subDocument -- in that order). A scan
+    failure is logged and leaves `info` untouched.
+    """
+    try:
+        external = _scan_external_relationships(filepath)
+    except Exception as e:
+        logger.warning(f"External-rels scan failed on {filepath}: {e}")
+        return
+
+    if not external:
+        return
+
+    info['external_refs'] = external
+    notes = info['detection_notes']
+
+    def count_of(kind: str) -> int:
+        return sum(1 for ref in external if ref['relationship'] == kind)
+
+    def plural(n: int) -> str:
+        return '' if n == 1 else 's'
+
+    template_count = count_of('attachedTemplate')
+    if template_count:
+        notes.append(
+            f"MITRE T1221: Remote Template Injection -- {template_count} "
+            f"external `attachedTemplate` reference"
+            f"{plural(template_count)}. "
+            f"Malicious VBA likely lives in the remote target, not in this file."
+        )
+
+    ole_count = count_of('oleObject')
+    if ole_count:
+        notes.append(
+            f"{ole_count} external OLE-object reference"
+            f"{plural(ole_count)} -- remote-fetched embedded payload"
+        )
+
+    subdoc_count = count_of('subDocument')
+    if subdoc_count:
+        notes.append(
+            f"{subdoc_count} external subDocument reference"
+            f"{plural(subdoc_count)}"
+        )
+
+
+def _scan_external_relationships(filepath: str) -> List[Dict]:
+    """Walk every `*.rels` part inside an OOXML container and return the
+    relationships whose `TargetMode` is `External` AND whose Type is one of
+    `_INTERESTING_RELS`.
+
+    Returns `[]` for non-zip files (legacy CFBF .doc/.xls binaries) and for
+    archives that turn out to be corrupt. Each finding is a dict with
+    `rels_file`, `relationship`, `target`, `target_mode`, `full_type`.
+
+    The input is an untrusted upload, so:
+      - the `.rels` suffix is matched case-insensitively, so a part named
+        e.g. `document.xml.RELS` is not skipped;
+      - parts whose declared uncompressed size exceeds a fixed cap are
+        skipped (and logged) instead of being inflated into memory;
+      - parts are read via their own ZipInfo, so the size that was checked
+        belongs to the entry that is actually read.
+    """
+    if not zipfile.is_zipfile(filepath):
+        return []
+
+    # Upper bound on a single .rels part we are willing to decompress.
+    max_rels_bytes = 4 * 1024 * 1024
+
+    findings: List[Dict] = []
+    try:
+        with zipfile.ZipFile(filepath) as z:
+            for entry in z.infolist():
+                rels_name = entry.filename
+                if not rels_name.lower().endswith('.rels'):
+                    continue
+                if entry.file_size > max_rels_bytes:
+                    logger.warning(
+                        f"Skipping oversized rels part {rels_name} in {filepath} "
+                        f"({entry.file_size} bytes)"
+                    )
+                    continue
+                # One unreadable or malformed part must not hide the others.
+                try:
+                    data = z.read(entry)
+                except Exception:
+                    continue
+                try:
+                    root = ET.fromstring(data)
+                except ET.ParseError:
+                    continue
+
+                for rel in root.iter():
+                    # Strip the `{namespace}` prefix ElementTree puts on tags.
+                    tag = rel.tag.rsplit('}', 1)[-1] if '}' in rel.tag else rel.tag
+                    if tag != 'Relationship':
+                        continue
+                    if rel.attrib.get('TargetMode', '').lower() != 'external':
+                        continue
+                    rel_type = rel.attrib.get('Type', '')
+                    target = rel.attrib.get('Target', '')
+                    # The relationship kind is the last path segment of the Type URI.
+                    rel_name = rel_type.rsplit('/', 1)[-1] if '/' in rel_type else rel_type
+                    if rel_name not in _INTERESTING_RELS:
+                        continue
+                    findings.append({
+                        'rels_file': rels_name,
+                        'relationship': rel_name,
+                        'target': target,
+                        'target_mode': 'External',
+                        'full_type': rel_type,
+                    })
+    except zipfile.BadZipFile:
+        # is_zipfile() passed but the archive is corrupt: report what we have.
+        pass
+
+    return findings