diff --git a/CHANGELOG.md b/CHANGELOG.md index c450bdb..0a2d257 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,13 @@ All notable changes to this project will be documented in this file. - Elastic YARA rules synced to upstream `d131ea8` (2026-04-30, 686 rules — 684 upstream + Morpes/Torii retained locally after Elastic rotated them out) - YARA-Forge bumped to 0.9.1 (release `20260503`, 2026-05-03) — separate `YARAForge_Extended.yar` pack alongside the Elastic rules +### File-type analyzers +- HTML smuggling analyzer (`app/utils/htmlsmuggle.py`) — pattern set + scoring model ported from RootUp/SmuggleShield. Runs at upload time on `.html` / `.htm` files. Catches in-page payload assembly (atob → Uint8Array → Blob → URL.createObjectURL → `<a download>` click), GWT smuggling artifacts, WebAssembly drop chains, dataset-driven payload tags, and ~80 other regex signatures. Output lands in `file_info.html_smuggle_info`. +- Office macro detail surfacing — the existing olevba pipeline now exposes per-module VBA source, autoexec triggers, suspicious keyword hits, and IOCs as structured tables on the upload-result page (was previously only a one-line "5 auto-execution triggers detected" summary). +- T1221 Remote Template Injection detection — `_scan_external_relationships` walks every OOXML container's `*.rels` files looking for external `attachedTemplate` / `oleObject` / `subDocument` / `frame` references. Catches Atomic Red Team's `Calculator.docx` (and the wider class) where `has_macros: false` but the malicious VBA lives in a remote `.dotm`. +- File-type analyzers split into dedicated modules — `utils/office.py`, `utils/lnk.py`, `utils/htmlsmuggle.py`. `forensics.py` is now strictly PE / MalAPI / entropy. Re-exports preserved through `app/utils/__init__.py` so existing call sites keep working. +- `allowed_extensions` expanded to cover macro-enabled Office (`docm`, `dotm`, `xlsm`, `xltm`), legacy CFBF binaries (`doc`, `xls`, `rtf`), and HTML (`html`, `htm`). 
Upload page now gates analysis tabs by file family: office + html files only show Static (Dynamic / EDR aren't relevant for these without an Office install on the target host); driver files keep the existing static-driver + HolyGrail flow. + ### Notes - New runtime dependency: `requests==2.32.3` - Whiskers binary not committed — build via `cargo build --release` (see `Whiskers/BUILD.md`) diff --git a/Config/config.yaml b/Config/config.yaml index cafb6fd..aa7e5d3 100644 --- a/Config/config.yaml +++ b/Config/config.yaml @@ -8,13 +8,26 @@ application: utils: allowed_extensions: + # Executables / loaders / drivers - exe - dll - bin - - docx - - xlsx - - lnk - sys + - lnk + # Word (OOXML + legacy CFBF) + - docx # no macros per spec, but still uploadable for T1221 template injection samples + - docm # macro-enabled + - dotm # template macro-enabled (T1221 target) + - doc # Word 97-2003 -- frequently weaponised with VBA macros + - rtf # Rich Text Format -- OLE-embedded payloads, T1203 patterns + # Excel (OOXML + legacy CFBF) + - xlsx # no macros per spec; still routed through olevba in case of XLM smuggling + - xlsm # macro-enabled + - xltm # template macro-enabled + - xls # Excel 97-2003 -- carries VBA + Excel 4.0 / XLM macros + # HTML (.html / .htm only) -- routed through the SmuggleShield-derived static analyzer + - html # plain HTML + - htm # alternate HTML extension max_file_size: 104857600 # 100MB in bytes upload_folder: "Uploads" result_folder: "Results" diff --git a/app/static/js/upload/core.js b/app/static/js/upload/core.js index 66f4f4f..7bcdea1 100644 --- a/app/static/js/upload/core.js +++ b/app/static/js/upload/core.js @@ -64,6 +64,10 @@ document.addEventListener('DOMContentLoaded', function() { officeInfo: document.getElementById('officeInfo'), macroStatus: document.getElementById('macroStatus'), macroDetectionNotes: document.getElementById('macroDetectionNotes'), + htmlSmuggleInfo: document.getElementById('htmlSmuggleInfo'), + smuggleStatus: 
document.getElementById('smuggleStatus'), + smuggleDetectionNotes: document.getElementById('smuggleDetectionNotes'), + smuggleInfo: document.getElementById('smuggleInfo'), checksumInfo: document.getElementById('checksumInfo'), checksumStatus: document.getElementById('checksumStatus'), storedChecksum: document.getElementById('storedChecksum'), @@ -193,19 +197,42 @@ document.addEventListener('DOMContentLoaded', function() { // // The analysis-mode selector is a single segmented control with one tab // per mode (Static / Dynamic / each EDR profile / HolyGrail). Each tab - // is tagged data-family="regular" or "driver"; we only show the family - // matching the uploaded file. The first visible tab becomes active. + // is tagged with one or more `data-family` values (space-separated) and + // only tabs matching the uploaded file's family are shown. + // + // Four families: + // driver -- .sys (-> static-driver, holygrail) + // office -- Word / Excel macro-bearing documents (-> static only; + // dynamic / EDR don't make sense without an Office install + // on the target host -- olevba is the relevant scanner) + // html -- .html / .htm (-> static only; SmuggleShield-derived + // pattern analyzer runs at upload time as html_smuggle_info) + // regular -- everything else (-> all / static / dynamic / edr:*) + const DRIVER_EXTS = new Set(['sys']); + const OFFICE_EXTS = new Set([ + 'docx', 'docm', 'dotm', 'doc', 'rtf', + 'xlsx', 'xlsm', 'xltm', 'xls', + ]); + const HTML_EXTS = new Set(['html', 'htm']); + function updateAnalysisOptions(fileExtension) { - isDriverFile = fileExtension.toLowerCase() === 'sys'; - const family = isDriverFile ? 'driver' : 'regular'; + const ext = (fileExtension || '').toLowerCase(); + isDriverFile = DRIVER_EXTS.has(ext); + const family = isDriverFile ? 'driver' + : OFFICE_EXTS.has(ext) ? 'office' + : HTML_EXTS.has(ext) ? 
'html' + : 'regular'; const tabs = document.querySelectorAll('#modeTabs .lb-tab'); const bodies = document.querySelectorAll('.lb-mode-body'); - // Show only tabs for this file family; active state moves to first. + // Show only tabs whose `data-family` list contains this file's family. + // Multiple families are space-separated (e.g. `regular office` for the + // Static tab, which serves both classes). let firstVisible = null; tabs.forEach(t => { - const matches = t.dataset.family === family; + const families = (t.dataset.family || '').split(/\s+/); + const matches = families.includes(family); t.classList.toggle('hidden', !matches); t.classList.remove('active'); if (matches && !firstVisible) firstVisible = t; @@ -245,6 +272,7 @@ document.addEventListener('DOMContentLoaded', function() { function renderFileTypeSpecificInfo(fileInfo) { elements.peInfo.classList.add('hidden'); elements.officeInfo.classList.add('hidden'); + if (elements.htmlSmuggleInfo) elements.htmlSmuggleInfo.classList.add('hidden'); elements.suspiciousImports.classList.add('hidden'); if (fileInfo.entropy_analysis) { @@ -359,24 +387,7 @@ document.addEventListener('DOMContentLoaded', function() { } else if (fileInfo.office_info) { elements.officeInfo.classList.remove('hidden'); - const office = fileInfo.office_info; - - elements.macroStatus.className = `px-3 py-1 text-sm rounded-full ${ - office.has_macros ? 'bg-red-500/8 text-red-300 border border-red-500/22' : 'bg-green-500/8 text-green-300 border border-green-500/22' - }`; - elements.macroStatus.textContent = office.has_macros ? 'Macros Present' : 'No Macros'; - - if (office.detection_notes && office.detection_notes.length > 0) { - elements.macroDetectionNotes.innerHTML = office.detection_notes.map(note => ` -
- - - - ${note} -
- `).join(''); - } + renderOfficeInfo(fileInfo.office_info); } else if (fileInfo.lnk_info) { // Show LNK-specific information section @@ -384,6 +395,293 @@ document.addEventListener('DOMContentLoaded', function() { lnkInfoSection.classList.remove('hidden'); renderLnkInfo(fileInfo.lnk_info); } + else if (fileInfo.html_smuggle_info) { + const htmlSection = document.getElementById('htmlSmuggleInfo'); + if (htmlSection) htmlSection.classList.remove('hidden'); + renderHtmlSmuggleInfo(fileInfo.html_smuggle_info); + } + } + + // -- Office macro / template-injection rendering -------------------- + // + // Surfaces every non-empty piece of the `office_info` structure: + // * Status pill: Macros Present / No Macros + // * Detection notes (one-line summaries) + // * Autoexec triggers (table: keyword + description) + // * Suspicious keywords (table: keyword + description) + // * IOCs (table: type + value) + // * External refs (table: relationship + target -- T1221 etc.) + // * Per-module VBA source code (collapsible
) + // * Hex / Base64 / VBA strings (collapsible) + // + // The DOM container (#officeInfo) already exists in upload.html; this + // function rewrites #macroDetectionNotes (status notes) and #macroInfo + // (detail blocks) every time it runs. + function escapeHtml(s) { + return String(s ?? '').replace(/[&<>"']/g, c => ( + { '&': '&', '<': '<', '>': '>', '"': '"', "'": ''' }[c] + )); + } + + function macroSeverityClass(office) { + // Treat external attachedTemplate references and live macros as the + // strong signals. Everything else goes "info". + if (office.has_macros) return 'critical'; + if ((office.external_refs || []).some(r => r.relationship === 'attachedTemplate')) return 'critical'; + if ((office.external_refs || []).length > 0) return 'medium'; + return 'low'; + } + + function renderTable(headers, rows) { + if (!rows.length) return ''; + const head = headers.map(h => `${escapeHtml(h)}`).join(''); + const body = rows.map(r => `${r.map(c => `${c}`).join('')}`).join(''); + return `${head}${body}
`; + } + + function renderSection(title, body, opts) { + opts = opts || {}; + if (!body) return ''; + const collapsible = opts.collapsible; + const open = opts.open === undefined ? false : opts.open; + const heading = `
${escapeHtml(title)}
`; + if (collapsible) { + return `${heading}
${escapeHtml(opts.summary || 'show')}${body}
`; + } + return `${heading}${body}`; + } + + function renderOfficeInfo(office) { + // Status pill + const sev = macroSeverityClass(office); + const sevClassMap = { + critical: 'bg-red-500/8 text-red-300 border border-red-500/22', + medium: 'bg-yellow-500/8 text-yellow-300 border border-yellow-500/22', + low: 'bg-green-500/8 text-green-300 border border-green-500/22', + }; + elements.macroStatus.className = `px-3 py-1 text-sm rounded-full ${sevClassMap[sev]}`; + elements.macroStatus.textContent = office.has_macros + ? 'Macros Present' + : ((office.external_refs || []).length > 0 ? 'External Refs' : 'No Macros'); + + // Top-level detection notes (one-line summaries) + const notes = office.detection_notes || []; + elements.macroDetectionNotes.innerHTML = notes.map(note => ` +
+ + ${escapeHtml(note)} +
+ `).join(''); + + // Detailed sections + const macroInfo = document.getElementById('macroInfo'); + if (!macroInfo) return; + const parts = []; + + // External references (T1221 etc.) -- shown FIRST when present + // since they're often the only signal for documents that have no VBA. + const refs = office.external_refs || []; + if (refs.length > 0) { + const rows = refs.map(r => [ + `${escapeHtml(r.relationship)}`, + `
${escapeHtml(r.target)}`, + `${escapeHtml(r.rels_file)}`, + ]); + parts.push(renderSection('External References (Remote Targets)', renderTable(['Relationship', 'Target', 'In .rels'], rows))); + } + + const a = office.analysis || {}; + + // Autoexec triggers + if ((a.autoexec || []).length > 0) { + const rows = a.autoexec.map(e => [ + `${escapeHtml(e.keyword || '?')}`, + `${escapeHtml(e.description || '')}`, + ]); + parts.push(renderSection(`Auto-Execution Triggers (${a.autoexec.length})`, renderTable(['Keyword', 'Description'], rows))); + } + + // Suspicious keywords + if ((a.suspicious || []).length > 0) { + const rows = a.suspicious.map(e => [ + `${escapeHtml(e.keyword || '?')}`, + `${escapeHtml(e.description || '')}`, + ]); + parts.push(renderSection(`Suspicious Keywords (${a.suspicious.length})`, renderTable(['Keyword', 'Description'], rows))); + } + + // IOCs (URLs, IPs, EXEs, etc. that olevba pulled out of the macro body) + if ((a.iocs || []).length > 0) { + const rows = a.iocs.map(ioc => [ + `${escapeHtml(ioc.type || '?')}`, + `${escapeHtml(ioc.value || '')}`, + ]); + parts.push(renderSection(`IOCs Extracted from Macro (${a.iocs.length})`, renderTable(['Type', 'Value'], rows))); + } + + // Hex / Base64 / VBA-encoded strings (decoded by olevba) + const stringSets = [ + ['Hex Strings', a.hex_strings || []], + ['Base64 Strings', a.base64_strings || []], + ['VBA-Encoded Strings', a.vba_strings || []], + ]; + for (const [label, items] of stringSets) { + if (items.length === 0) continue; + const body = items.map(e => `
${escapeHtml(e.keyword || '')}: ${escapeHtml(e.description || '')}
`).join(''); + parts.push(renderSection(`${label} (${items.length})`, body, { collapsible: true, summary: `${items.length} item(s) -- click to expand` })); + } + + // Per-module VBA source code -- collapsible + const modules = office.modules || []; + if (modules.length > 0) { + const body = modules.map(m => ` +
+
+ ${escapeHtml(m.vba_filename || '?')} + -- ${escapeHtml(m.stream || '')} +
+
${escapeHtml(m.code || '')}
+
+ `).join(''); + parts.push(renderSection(`VBA Source (${modules.length} module${modules.length !== 1 ? 's' : ''})`, body, { collapsible: true, summary: `${modules.length} module(s) -- click to view source code` })); + } + + macroInfo.innerHTML = parts.join(''); + } + + // -- HTML smuggling rendering -------------------------------------- + // + // Surfaces every non-empty piece of the `html_smuggle_info` structure + // produced by app/utils/htmlsmuggle.py: + // * Status pill: SMUGGLING / SUSPICIOUS / CLEAN with score + // * Detection notes (one-line summaries) + // * Score bar + matched-categories pill row + // * Matched patterns (table: name + category + weight) + // * Surface features (table: feature + value) + // * IOCs (download filenames, dataset blobs, largest base64 preview) + // + // Reuses the renderTable / renderSection / escapeHtml helpers defined + // for the office macro renderer. + function smuggleSeverityClass(h) { + if (h.is_smuggling) return 'critical'; + if ((h.score || 0) > 0) return 'medium'; + return 'low'; + } + + function renderHtmlSmuggleInfo(h) { + // Status pill + const sev = smuggleSeverityClass(h); + const sevClassMap = { + critical: 'bg-red-500/8 text-red-300 border border-red-500/22', + medium: 'bg-yellow-500/8 text-yellow-300 border border-yellow-500/22', + low: 'bg-green-500/8 text-green-300 border border-green-500/22', + }; + if (elements.smuggleStatus) { + elements.smuggleStatus.className = `px-3 py-1 text-sm rounded-full ${sevClassMap[sev]}`; + const label = h.is_smuggling + ? `SMUGGLING (score ${h.score}/${h.threshold})` + : (h.score > 0 ? `SUSPICIOUS (score ${h.score}/${h.threshold})` : 'CLEAN'); + elements.smuggleStatus.textContent = label; + } + + // Detection notes + const notes = h.detection_notes || []; + if (elements.smuggleDetectionNotes) { + elements.smuggleDetectionNotes.innerHTML = notes.map(note => ` +
+ + ${escapeHtml(note)} +
+ `).join(''); + } + + // Detail blocks + const host = elements.smuggleInfo; + if (!host) return; + const parts = []; + + // Score line + matched-category pills + const cats = h.matched_categories || {}; + if (Object.keys(cats).length > 0) { + const pills = Object.entries(cats).map(([cat, count]) => + `${escapeHtml(cat)} × ${count}` + ).join(' '); + parts.push(renderSection('Pattern Categories', `
${pills}
`)); + } + + // Matched patterns -- the actual signatures that fired + const matches = h.matched_patterns || []; + if (matches.length > 0) { + const rows = matches.map(m => [ + `${escapeHtml(m.name)}`, + `${escapeHtml(m.category || '?')}`, + `+${m.weight || 0}`, + ]); + parts.push(renderSection(`Matched Patterns (${matches.length})`, renderTable(['Pattern', 'Category', 'Weight'], rows))); + } + + // Surface features + const f = h.features || {}; + if (Object.keys(f).length > 0) { + const featureRows = [ + ['File size (bytes)', f.file_size], + ['Script tags', f.script_tags], + ['iframe tags', f.iframe_tags], + ['embed tags', f.embed_tags], + ['Base64 blob count (>=50 chars)', f.base64_blob_count], + ['Largest base64 blob (chars)', f.largest_base64_chars], + ['Has blob()', f.has_blob], + ['Has atob()', f.has_atob], + ['Has Uint8Array', f.has_uint8array], + ['Has URL.createObjectURL', f.has_createobjecturl], + ['Has ', f.has_download_attr], + ['Has String.fromCharCode', f.has_fromcharcode], + ].filter(([, v]) => v !== undefined && v !== null && v !== false && v !== 0) + .map(([label, v]) => [ + `${escapeHtml(label)}`, + `${escapeHtml(String(v))}`, + ]); + if (featureRows.length > 0) { + parts.push(renderSection('Surface Features', renderTable(['Feature', 'Value'], featureRows))); + } + } + + // IOCs + const iocs = h.iocs || {}; + const iocBits = []; + if ((iocs.download_filenames || []).length > 0) { + const rows = iocs.download_filenames.map(name => [ + `download=`, + `${escapeHtml(name)}`, + ]); + iocBits.push(renderTable(['Type', 'Value'], rows)); + } + if ((iocs.data_file_attrs || []).length > 0) { + const rows = iocs.data_file_attrs.map(d => [ + `data-file=`, + `${escapeHtml(d)}`, + ]); + iocBits.push(renderTable(['Type', 'Value (truncated)'], rows)); + } + if (iocs.largest_base64_blob && iocs.largest_base64_blob.length > 0) { + const b = iocs.largest_base64_blob; + iocBits.push(` +
+
Largest base64 blob: ${b.length} chars
+
First 120: ${escapeHtml(b.preview_first_120)}
+ ${b.preview_last_120 ? `
Last 120: ${escapeHtml(b.preview_last_120)}
` : ''} +
+ `); + } + if (iocBits.length > 0) { + parts.push(renderSection('IOCs', iocBits.join(''))); + } + + if (h.truncated) { + parts.push(`
⚠ Scan was truncated -- file exceeds the 5 MiB cap.
`); + } + + host.innerHTML = parts.join(''); } function getRuntimeConfig(buildWith) { diff --git a/app/templates/upload.html b/app/templates/upload.html index e402b02..96488f8 100644 --- a/app/templates/upload.html +++ b/app/templates/upload.html @@ -75,7 +75,7 @@ All - @@ -363,6 +363,16 @@
+ + +
diff --git a/app/utils/__init__.py b/app/utils/__init__.py index b142618..28af330 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -8,8 +8,6 @@ Prefer importing directly from submodules in new code: from .file_io import ( FileTypeDetector, detect_file_type, - get_lnk_info, - get_office_info, get_pe_info, save_uploaded_file, ) @@ -19,6 +17,9 @@ from .forensics import ( calculate_entropy, get_security_analyzer, ) +from .htmlsmuggle import get_html_smuggle_info +from .lnk import get_lnk_info +from .office import get_office_info from .json_helpers import ( extract_detection_counts, format_hex, @@ -41,7 +42,7 @@ __all__ = [ 'allowed_file', 'calculate_entropy', 'calculate_risk', 'calculate_yara_risk', 'check_tool', 'detect_file_type', 'extract_detection_counts', 'find_file_by_hash', 'format_hex', 'format_size', 'generate_html_report', - 'get_entropy_risk_level', 'get_lnk_info', 'get_office_info', 'get_pe_info', - 'get_risk_level', 'get_security_analyzer', 'load_json_file', - 'save_uploaded_file', 'validate_pid', + 'get_entropy_risk_level', 'get_html_smuggle_info', 'get_lnk_info', + 'get_office_info', 'get_pe_info', 'get_risk_level', + 'get_security_analyzer', 'load_json_file', 'save_uploaded_file', 'validate_pid', ] diff --git a/app/utils/file_io.py b/app/utils/file_io.py index 18e54f3..04bd1bf 100644 --- a/app/utils/file_io.py +++ b/app/utils/file_io.py @@ -1,5 +1,15 @@ # app/utils/file_io.py -"""File ingestion: type detection, PE/Office/LNK metadata, upload handling.""" +"""File ingestion: type detection, upload handling. + +Per-file-type inspectors (PE / Office / LNK / HTML-smuggling) are dispatched +from `save_uploaded_file` based on the detected family. 
Each inspector lives +in its own module: + + * PE -- get_pe_info (this module, uses forensics.SecurityAnalyzer) + * Office -- utils/office.py (get_office_info) + * LNK -- utils/lnk.py (get_lnk_info) + * HTML smuggling -- utils/htmlsmuggle.py (get_html_smuggle_info) +""" import datetime import hashlib import json @@ -11,8 +21,10 @@ import struct import pefile from werkzeug.utils import secure_filename -from ..analyzers.static.lnk_parser import LnkForensics from .forensics import calculate_entropy, get_security_analyzer +from .htmlsmuggle import get_html_smuggle_info +from .lnk import get_lnk_info +from .office import get_office_info from .risk_analyzer import RiskCalculator @@ -42,6 +54,12 @@ class FileTypeDetector: elif header.startswith(cls.LNK_HEADER): return cls._detect_lnk_type(filepath) + # HTML / HTM detection -- file-extension based since HTML has no + # consistent magic. Cheap to check after the binary-header tests + # already missed. + if p.suffix.lower() in ('.html', '.htm'): + return {"family": "html", "type": p.suffix.lower().lstrip('.')} + return {"family": "unknown", "type": "unknown"} except Exception as e: @@ -142,8 +160,20 @@ class FileTypeDetector: "visio/document.xml": "vsdx", } + # Flag macro-enabled OOXML by presence of vbaProject.bin -- + # promotes docx/xlsx/pptx -> docm/xlsm/pptm so the dashboard + # Type field reflects what's actually in the container. 
+ has_vba = any(n.endswith("vbaproject.bin") for n in names) + macro_enabled_map = { + "docx": "docm", + "xlsx": "xlsm", + "pptx": "pptm", + } + for path, file_type in ooxml_types.items(): if path in names: + if has_vba and file_type in macro_enabled_map: + file_type = macro_enabled_map[file_type] return {"family": "office", "type": file_type} return {"family": "office", "type": "ooxml-unknown"} @@ -331,24 +361,9 @@ def _build_pe_detection_notes(is_valid_checksum, suspicious_imports, return detection_notes -def get_office_info(filepath, malapi_path): - """Analyze Office macros (delegates to SecurityAnalyzer).""" - return get_security_analyzer(malapi_path).analyze_office_macros(filepath) - - -def get_lnk_info(filepath): - """Analyze a Windows .LNK shortcut for forensic data.""" - try: - lnk = LnkForensics(filepath) - if not lnk.is_valid(): - return {'lnk_info': None} - - forensic_data = lnk.get_forensic_data() - return {'lnk_info': forensic_data} - - except Exception as e: - print(f"Error analyzing LNK file: {e}") - return {'lnk_info': None} +# Office / LNK / HTML-smuggling inspectors live in their own modules +# (imported at the top of this file). PE inspection stays here because it's +# tightly coupled to the SecurityAnalyzer cache (MalAPI lookup + entropy). def _build_entropy_analysis(entropy_value): @@ -456,6 +471,11 @@ def save_uploaded_file(file, config): if 'error' not in lnk_result: file_info.update(lnk_result) + elif file_type_info['family'] == 'html': + # Always update -- get_html_smuggle_info returns a usable dict even + # for clean files (just with is_smuggling=false / score=0). 
+ file_info.update(get_html_smuggle_info(filepath)) + with open(os.path.join(result_folder, filename, 'file_info.json'), 'w') as f: json.dump(file_info, f) diff --git a/app/utils/forensics.py b/app/utils/forensics.py index 3b07f10..6db2926 100644 --- a/app/utils/forensics.py +++ b/app/utils/forensics.py @@ -1,11 +1,15 @@ # app/utils/forensics.py -"""PE/Office forensic analysis: entropy, runtime detection, MalAPI lookup.""" +"""PE forensic analysis: entropy, runtime detection, MalAPI lookup. + +Office / LNK / HTML-smuggling analyzers live in their own modules +(`utils/office.py`, `utils/lnk.py`, `utils/htmlsmuggle.py`) so each file-type +inspector is self-contained and easy to maintain. This module is now strictly +PE-focused. +""" import json import math from collections import Counter -from oletools.olevba import VBA_Parser - # Known runtime imports for compiled languages — used to flag PE imports as # benign-runtime rather than suspicious. @@ -251,46 +255,6 @@ class SecurityAnalyzer: return sections_info - def analyze_office_macros(self, filepath): - """Inspect Office VBA macros for suspicious patterns.""" - try: - vbaparser = VBA_Parser(filepath) - detection_notes = [] - - info = { - 'file_type': 'Microsoft Office Document', - 'has_macros': vbaparser.detect_vba_macros(), - 'macro_info': None, - 'detection_notes': detection_notes, - } - - if vbaparser.detect_vba_macros(): - macro_analysis = vbaparser.analyze_macros() - info['macro_info'] = macro_analysis - - macro_text = str(macro_analysis).lower() - detection_patterns = { - 'shell': 'Shell command execution detected', - 'wscript': 'WScript execution detected', - 'powershell': 'PowerShell execution detected', - 'http': 'Network communication detected', - 'auto': 'Auto-execution mechanism detected', - 'document_open': 'Document open auto-execution', - 'windowshide': 'Hidden window execution', - 'createobject': 'COM object creation detected', - } - - for pattern, note in detection_patterns.items(): - if pattern in 
macro_text: - detection_notes.append(note) - - vbaparser.close() - return {'office_info': info} - except Exception as e: - print(f"Error analyzing Office file: {e}") - return {'office_info': None} - - _security_analyzer_cache = {} diff --git a/app/utils/htmlsmuggle.py b/app/utils/htmlsmuggle.py new file mode 100644 index 0000000..d4a7003 --- /dev/null +++ b/app/utils/htmlsmuggle.py @@ -0,0 +1,282 @@ +# app/utils/htmlsmuggle.py +"""HTML-smuggling pattern scanner. + +Runs at upload time on `.html` / `.htm` files (alongside `get_pe_info`, +`get_office_info`, `get_lnk_info`). Output lands in `file_info.html_smuggle_info` +and is rendered on the upload-result page the same way office_info is. + +Pattern set + scoring model ported from SmuggleShield's `content.js` +(https://github.com/RootUp/SmuggleShield). The browser extension catches +runtime behaviour (DOM mutation, blob URL revoke, programmatic
+click); we catch the file-on-disk equivalent by regex-scanning the raw +HTML source. + +Scoring (mirrors SmuggleShield): + - Each pattern carries a weight (2-4). + - High-weight (>=3) patterns scanned first; early-return when the score + crosses the threshold. + - Low-weight (<3) patterns scanned only when high-weight pass landed + within `threshold - 2` of crossing. + - A cheap pre-filter (`atob | blob | base64 | createobjecturl | ...`) + skips files that obviously aren't smuggling. +""" + +import os +import re +from typing import Dict, List + + +# (weight, pattern, name, category) +_PATTERNS = [ + # --- Direct base64 -> binary -> blob path ---------------------------- + (3, r'atob\s*\([^)]+\).*new\s+uint8array', 'atob_to_uint8array', 'encoding'), + (3, r"atob\s*\(\s*['\"]([A-Za-z0-9+/=]{100,})['\"].*\)", 'large_base64_atob', 'encoding'), + (3, r'new\s+blob\s*\(\s*\[\s*(?:data|atob\s*\()', 'blob_from_atob_data', 'blob'), + (4, r"let\s+arrayBuffer\s*=\s*\['0x[0-9a-f]{2}'(?:\s*,\s*'0x[0-9a-f]{2}')+\]", 'hex_array_buffer', 'encoding'), + + # --- Reversed-string fromCharCode obfuscation ------------------------ + (4, r'\["edoCrahCmorf"(?:\s*\[\s*"split"\s*\]\s*\(\s*""\s*\)\s*\[\s*"reverse"\s*\]\s*\(\s*\)\s*\[\s*"join"\s*\]\s*\(\s*""\s*\))', 'reversed_fromcharcode_obf', 'obfuscation'), + (4, r'setTimeout\s*\(\s*\[.*?\]\.map\s*\(\s*.*?=>.*?(?:fromCharCode|edoCrahCmorf).*?\/\s*\d+\s*\)', 'settimeout_fromcharcode', 'obfuscation'), + (3, r'String\s*\[\s*(?:"edoCrahCmorf"|[\'"][^\'\"]+[\'"]\.split\([\'"][\'"]\)\.reverse\(\)\.join\([\'"][\'"]\))\s*\]', 'string_reverse_index', 'obfuscation'), + + # --- Blob -> object URL -> download chain ---------------------------- + (3, r'url\.createobjecturl\s*\(\s*(?:my)?blob\s*\)', 'createobjecturl_from_blob', 'blob'), + (3, r'location(?:\s*\[\s*[\'"]href[\'"]\s*\])?\s*=\s*url', 'location_href_assign', 'writer'), + (2, r'url\.revokeobjecturl\s*\(\s*url\s*\)', 'revokeobjecturl', 'blob'), + (3, 
r'\.style\s*=\s*[\'"]display:\s*none[\'"].*\.href\s*=.*\.download\s*=', 'hidden_anchor_download', 'writer'), + (3, r'\.click\s*\(\s*\).*url\.revokeobjecturl', 'auto_click_then_revoke', 'writer'), + (3, r'href\s*=\s*["\']data:(?:application/octet-stream|image/svg\+xml);base64,', 'data_url_octet_stream', 'writer'), + + # --- Bracket-string property access (window["a"+"to"+"b"] etc.) ------ + (3, r'window\s*\[\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\]', 'window_bracket_concat', 'obfuscation'), + (4, r'document\s*\[\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\]\s*\(\s*window\s*\[\s*(?:[\'"]at[\'"].*[\'"]o[\'"].*[\'"]b[\'"]\s*\]|\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\])\s*\([\'"][A-Za-z0-9+/=]+[\'"]\)\s*\)', 'document_bracket_atob', 'obfuscation'), + (4, r'var\s+\w+=\w+;?\s*\(function\(\w+,\w+\)\{.*while\(!!\[\]\)\{try\{.*parseint.*\}catch\(\w+\)\{.*\}\}\(.*\)\);?', 'parseint_obfuscator', 'obfuscation'), + + # --- Blob mime-type signatures + writer chain ------------------------ + (3, r'blob\s*\(\s*\[[^\]]+\]\s*,\s*\{\s*type\s*:\s*[\'"](?:application/octet-stream|text/html|octet/stream)[\'"](?:\s*,\s*encoding\s*:\s*[\'"]base64[\'"])?\s*\}\s*\)', 'blob_with_octet_type', 'blob'), + + # --- WebAssembly / Go runtime smuggling ------------------------------ + (3, r'webassembly\s*\.\s*(?:instantiate(?:streaming)?|instance)', 'webassembly_instantiate', 'wasm'), + (2, r'navigator\.serviceworker\.register', 'service_worker_register', 'wasm'), + (2, r'wasm[_-]?exec\.js', 'wasm_exec_js', 'wasm'), + (3, r'\.wasm\b', 'wasm_extension_ref', 'wasm'), + (3, r'new\s+go\s*\(\s*\)', 'go_runtime_new', 'wasm'), + (3, r'go\s*\.\s*run\s*\(', 'go_runtime_run', 'wasm'), + + # --- Embedded srcdoc / iframe + script ------------------------------- + (3, r'srcdoc\s*=\s*["\'][^"\']*]*base64', 'embed_with_base64', 'writer'), + + # --- Decoder helpers + legacy IE save ------------------------------- + (3, 
r'function\s+(?:b64toarray|xor|base64toarraybuffer)\s*\([^)]*\)\s*\{[\s\S]*?return\s+(?:bytes\.buffer|result);?\}', 'decoder_helper_func', 'encoding'), + (3, r'document\.createelement\([\'"]embed[\'"]\)', 'createelement_embed', 'writer'), + (2, r'\.setattribute\([\'"]src[\'"]\s*,\s*.*\)', 'setattribute_src', 'writer'), + (3, r'window\.navigator\.mssaveoropenblob\s*\(\s*blob\s*,\s*filename\s*\)', 'mssaveoropenblob', 'writer'), + (2, r'(?:window\.)?url\.createobjecturl\s*\(\s*(?:blob|[^)]+)\s*\)', 'generic_createobjecturl', 'blob'), + (2, r'(?:a|element)\.download\s*=\s*(?:filename|[\'"][^\'"]+[\'"])', 'anchor_download_attr', 'writer'), + (2, r'string\.fromcharcode\(.*\)', 'string_fromcharcode', 'encoding'), + (2, r'\.charcodeat\(.*\)', 'charcodeat', 'encoding'), + (3, r'document\.getelementbyid\([\'"]passwordid[\'"]\)\.value', 'password_field_lookup', 'writer'), + (3, r'import\s*\(\s*url\.createobjecturl\s*\(', 'dynamic_import_objurl', 'wasm'), + (3, r'\w+\s*\(\s*\w+\s*\(\s*[\'"][A-Za-z0-9+/=]{50,}[\'"]\s*\)\s*\)', 'nested_call_long_b64', 'encoding'), + (2, r'(?:window\.)?atob\s*\(', 'atob_call', 'encoding'), + (2, r'uint8[aA]rray\s*\(\s*(?:(?!len)[^)])*\)', 'uint8array_constructor', 'encoding'), + (3, r'mssaveoropenblob|mssaveblob', 'mssave_alias', 'writer'), + (3, r'base64toarraybuffer', 'b64_to_arraybuffer_helper', 'encoding'), + (3, r'xmlhttprequest\(\).*\.responsetype\s*=\s*[\'"]arraybuffer[\'"]', 'xhr_arraybuffer_response', 'encoding'), + (3, r'new\s+dataview\(.*\).*\.getuint8\(.*\).*\.setuint8\(', 'dataview_getset_uint8', 'encoding'), + (2, r'[^\w](\w+)\s*=\s*(\w+)\s*\^\s*(\w+)', 'xor_operation', 'encoding'), + (2, r'\.slice\(\s*\w+\s*-\s*\d+\s*,\s*\w+\s*-\s*\d+\s*\)', 'string_slice_offset', 'obfuscation'), + (3, r'for\s*\([^)]+\)\s*\{[^}]*string\.fromcharcode\([^)]+\)', 'loop_fromcharcode', 'encoding'), + + # --- GWT (Google Web Toolkit) smuggling artefacts -------------------- + (4, r'\$wnd\s*=\s*window;\s*\$doc\s*=\s*\$wnd\.document', 'gwt_wnd_doc', 
'gwt'), + (4, r'__gwt_(?:isKnownPropertyValue|getMetaProperty|marker|stylesLoaded|scriptsLoaded)', 'gwt_internals', 'gwt'), + (3, r'\$strongName\s*=\s*[\'"][0-9A-F]{32}[\'"]', 'gwt_strong_name', 'gwt'), + (3, r'\$gwt_version\s*=\s*[\'"][0-9.]+[\'"]', 'gwt_version', 'gwt'), + (4, r'(?:function|var)\s+[a-zA-Z$_]+\s*=\s*\{\s*[a-zA-Z$_]+:\s*window,\s*[a-zA-Z$_]+:\s*document\s*\}', 'gwt_window_doc_pair', 'gwt'), + (3, r'\b(?:gwtOnLoad|__gwtStatsEvent|gwtOnLoadFunc)\b', 'gwt_onload', 'gwt'), + (3, r'\.setAttribute\([\'"]__gwt_property[\'"]', 'gwt_property_attr', 'gwt'), + (4, r'document\.createElement\([\'"]script[\'"]\).*?\.src\s*=.*?\.cache\.js', 'gwt_cache_js', 'gwt'), + + # --- Mouse/event-triggered drop chains ------------------------------- + (4, r'(?:document|window)\.on(?:mousemove|load|mouseover)\s*=\s*function\s*\(\s*\)\s*\{[^}]*?data:application/[^}]*?\.click\(\)[^}]*?(?:removeChild|remove)\(', 'mouse_event_drop', 'writer'), + (4, r'(?:window|var|let)\.\w+Triggered\s*=\s*(?:true|false).*?(?:navigator|platform).*?data:application/[^;]+;base64,.*?\.(?:download|click)', 'triggered_flag_drop', 'writer'), + (4, r'navigator\[?["\']platform["\']\]?.*?(?:document|window)\.on\w+.*?data:application/', 'platform_event_drop', 'writer'), + + # --- Generic split/concat/reverse obfuscation ------------------------ + (3, r'\[[\'"][^\'\"]+[\'"]\s*\+\s*[\'"][^\'\"]+[\'"]\]', 'string_concat_index', 'obfuscation'), + (3, r"\[\'[a-z]+\'\s*\+\s*\'[a-z]+\'\]", 'concat_lower_index', 'obfuscation'), + (3, r"\[\s*(?:[\'\"]\w?[\'\"](?:\s*,\s*)?){4,}\s*\]\.join\s*\(\s*[\'\"]*\s*\)", 'array_join_join', 'obfuscation'), + (3, r'const\s+\w+\s*=\s*\[\s*(?:[\'"]\w?[\'"](?:\s*,\s*)?){4,}', 'const_char_array', 'obfuscation'), + (4, r'(\[(?:\][^(]*|\[\])[^(]*|\w+\.)constructor\s*\(\s*([\'"])return\s*\w+\2\s*\)', 'constructor_return', 'obfuscation'), + (4, r'Function\s*\(\s*[\'"]return\s+\w+[\'"](?:\s*\)\s*\(\s*\)|\(\))', 'function_return', 'obfuscation'), + (3, 
r'\w+\.split\s*\(\s*[\'"][\'\"]?\s*\)\.reverse\s*\(\s*\)\.join\s*\(', 'split_reverse_join', 'obfuscation'), + (3, r'\[\s*\w+\.split\s*\(\s*[\'"][\'"]\s*\)\.reverse\s*\(\s*\)', 'array_split_reverse', 'obfuscation'), + (3, r'setTimeout\s*\(\s*(?:function|\(\)|[^,]+)\s*(?:=>)?\s*\{[\s\S]{10,}?setTimeout\s*\(', 'nested_settimeout', 'obfuscation'), + (4, r'setTimeout\s*\([^{)]*\{[^{}]*setTimeout\s*\([^{)]*\{[^{}]*\}', 'double_settimeout', 'obfuscation'), + (4, r'new\s*\([^)]*\[\s*(?:[\'"][^\'\"]+[\'"]\.split|[\'"]\w+[\'"]\.split)', 'new_with_split_index', 'obfuscation'), + (3, r'\[[^\]]*(?:join|reverse)[^\]]*\]\s*\(\s*(?:\w+|[\'"][^\'"]*[\'"])\s*\)', 'index_join_reverse', 'obfuscation'), + (3, r'\[\s*(?:urlMethod|parts\.join\(\)|[\'"]\w+[\'"]\s*\+)', 'partsjoin_index', 'obfuscation'), + (4, r'\w+\s*\[\s*(?:[\'"][^\'\"]+[\'"](?:\s*\+\s*)?)+\s*\]\s*\(\s*\w+\s*\)', 'concat_call', 'obfuscation'), + + # --- "down" + "load" decomposition (extremely common) ---------------- + (3, r'[\'"]?down[\'"]?\s*\+\s*[\'"]?load[\'"]?', 'down_plus_load', 'obfuscation'), + (4, r"\['down' \+ 'load'\]", 'down_load_bracket_exact', 'obfuscation'), + (4, r'createElement\s*\(\s*[\'"]a[\'"]\s*\)[^}]*?\[\s*[\'"]\w+[\'"]\s*\+\s*[\'"]\w+[\'"]\s*\]', 'createanchor_concat_attr', 'writer'), + (3, r"\['style'\]\['visi' \+ 'bility'\]", 'visibility_concat', 'obfuscation'), + + # --- Chunked-substr + dataset-based payload chains ------------------- + (3, r'function\s+\w+Chunks\s*\([^)]*\)\s*\{[^{}]*for\s*\([^{}]*\)\s*\{[^{}]*substr', 'chunk_substr_loop', 'encoding'), + (2, r'\.substr\s*\(\s*\w+\s*,\s*\w+Size\s*\)', 'substr_size_param', 'encoding'), + (4, r'\(async\s*\(\s*\)\s*=>\s*\{\s*(?:let|var|const)\s+d\s*=.*?(?:document\.getElementById|document\.querySelector).*?dataset.*?\.href\s*=\s*d.*?\.download\s*=.*?\.click\s*\(\s*\)', 'async_dataset_click', 'writer'), + (4, r'\bdocument\.getElementById\s*\(\s*[\'"]data[\'"]\s*\).*?\.dataset\.file.*?createElement\s*\(\s*[\'"]a[\'"]\s*\).*?\.download\s*=', 
'data_div_dataset_anchor', 'writer'), + (3, r']*id\s*=\s*["\']data["\'][^>]*data-file\s*=\s*["\'][A-Za-z0-9+/=]{50,}["\'][^>]*>', 'data_div_with_b64', 'writer'), + (4, r'