Add HTML smuggling analyzer, surface macro detail, split per-type modules

This commit is contained in:
BlackSnufkin
2026-05-05 06:01:58 -07:00
parent 5dbcf6892d
commit a609527c0b
10 changed files with 940 additions and 96 deletions
+7
View File
@@ -98,6 +98,13 @@ All notable changes to this project will be documented in this file.
- Elastic YARA rules synced to upstream `d131ea8` (2026-04-30, 686 rules — 684 upstream + Morpes/Torii retained locally after Elastic rotated them out)
- YARA-Forge bumped to 0.9.1 (release `20260503`, 2026-05-03) — separate `YARAForge_Extended.yar` pack alongside the Elastic rules
### File-type analyzers
- HTML smuggling analyzer (`app/utils/htmlsmuggle.py`) — pattern set + scoring model ported from RootUp/SmuggleShield. Runs at upload time on `.html` / `.htm` files. Catches in-page payload assembly (atob → Uint8Array → Blob → URL.createObjectURL → `<a download>` click), GWT smuggling artifacts, WebAssembly drop chains, dataset-driven payload tags, and ~80 other regex signatures. Output lands in `file_info.html_smuggle_info`.
- Office macro detail surfacing — the existing olevba pipeline now exposes per-module VBA source, autoexec triggers, suspicious keyword hits, and IOCs as structured tables on the upload-result page (was previously only a one-line "5 auto-execution triggers detected" summary).
- T1221 Remote Template Injection detection — `_scan_external_relationships` walks every OOXML container's `*.rels` files looking for external `attachedTemplate` / `oleObject` / `subDocument` / `frame` references. Catches Atomic Red Team's `Calculator.docx` (and the wider class) where `has_macros: false` but the malicious VBA lives in a remote `.dotm`.
- File-type analyzers split into dedicated modules — `utils/office.py`, `utils/lnk.py`, `utils/htmlsmuggle.py`. `forensics.py` is now strictly PE / MalAPI / entropy. Re-exports preserved through `app/utils/__init__.py` so existing call sites keep working.
- `allowed_extensions` expanded to cover macro-enabled Office (`docm`, `dotm`, `xlsm`, `xltm`), legacy CFBF binaries (`doc`, `xls`, `rtf`), and HTML (`html`, `htm`). Upload page now gates analysis tabs by file family: office + html files only show Static (Dynamic / EDR aren't relevant for these without an Office install on the target host); driver files keep the existing static-driver + HolyGrail flow.
### Notes
- New runtime dependency: `requests==2.32.3`
- Whiskers binary not committed — build via `cargo build --release` (see `Whiskers/BUILD.md`)
+16 -3
View File
@@ -8,13 +8,26 @@ application:
utils:
allowed_extensions:
# Executables / loaders / drivers
- exe
- dll
- bin
- docx
- xlsx
- lnk
- sys
- lnk
# Word (OOXML + legacy CFBF)
- docx # no macros per spec, but still uploadable for T1221 template injection samples
- docm # macro-enabled
- dotm # template macro-enabled (T1221 target)
- doc # Word 97-2003 -- frequently weaponised with VBA macros
- rtf # Rich Text Format -- OLE-embedded payloads, T1203 patterns
# Excel (OOXML + legacy CFBF)
- xlsx # no macros per spec; still routed through olevba in case of XLM smuggling
- xlsm # macro-enabled
- xltm # template macro-enabled
- xls # Excel 97-2003 -- carries VBA + Excel 4.0 / XLM macros
# HTML (.html / .htm only; .hta is not accepted) -- routed through the SmuggleShield-derived static analyzer
- html # plain HTML
- htm # alternate HTML extension
max_file_size: 104857600 # 100MB in bytes
upload_folder: "Uploads"
result_folder: "Results"
+322 -24
View File
@@ -64,6 +64,10 @@ document.addEventListener('DOMContentLoaded', function() {
officeInfo: document.getElementById('officeInfo'),
macroStatus: document.getElementById('macroStatus'),
macroDetectionNotes: document.getElementById('macroDetectionNotes'),
htmlSmuggleInfo: document.getElementById('htmlSmuggleInfo'),
smuggleStatus: document.getElementById('smuggleStatus'),
smuggleDetectionNotes: document.getElementById('smuggleDetectionNotes'),
smuggleInfo: document.getElementById('smuggleInfo'),
checksumInfo: document.getElementById('checksumInfo'),
checksumStatus: document.getElementById('checksumStatus'),
storedChecksum: document.getElementById('storedChecksum'),
@@ -193,19 +197,42 @@ document.addEventListener('DOMContentLoaded', function() {
//
// The analysis-mode selector is a single segmented control with one tab
// per mode (Static / Dynamic / each EDR profile / HolyGrail). Each tab
// is tagged data-family="regular" or "driver"; we only show the family
// matching the uploaded file. The first visible tab becomes active.
// is tagged with one or more `data-family` values (space-separated) and
// only tabs matching the uploaded file's family are shown.
//
// Four families:
// driver -- .sys (-> static-driver, holygrail)
// office -- Word / Excel macro-bearing documents (-> static only;
// dynamic / EDR don't make sense without an Office install
// on the target host -- olevba is the relevant scanner)
// html -- .html / .htm (-> static only; SmuggleShield-derived
// pattern analyzer runs at upload time as html_smuggle_info)
// regular -- everything else (-> all / static / dynamic / edr:*)
const DRIVER_EXTS = new Set(['sys']);
const OFFICE_EXTS = new Set([
'docx', 'docm', 'dotm', 'doc', 'rtf',
'xlsx', 'xlsm', 'xltm', 'xls',
]);
const HTML_EXTS = new Set(['html', 'htm']);
function updateAnalysisOptions(fileExtension) {
isDriverFile = fileExtension.toLowerCase() === 'sys';
const family = isDriverFile ? 'driver' : 'regular';
const ext = (fileExtension || '').toLowerCase();
isDriverFile = DRIVER_EXTS.has(ext);
const family = isDriverFile ? 'driver'
: OFFICE_EXTS.has(ext) ? 'office'
: HTML_EXTS.has(ext) ? 'html'
: 'regular';
const tabs = document.querySelectorAll('#modeTabs .lb-tab');
const bodies = document.querySelectorAll('.lb-mode-body');
// Show only tabs for this file family; active state moves to first.
// Show only tabs whose `data-family` list contains this file's family.
// Multiple families are space-separated (e.g. `regular office` for the
// Static tab, which serves both classes).
let firstVisible = null;
tabs.forEach(t => {
const matches = t.dataset.family === family;
const families = (t.dataset.family || '').split(/\s+/);
const matches = families.includes(family);
t.classList.toggle('hidden', !matches);
t.classList.remove('active');
if (matches && !firstVisible) firstVisible = t;
@@ -245,6 +272,7 @@ document.addEventListener('DOMContentLoaded', function() {
function renderFileTypeSpecificInfo(fileInfo) {
elements.peInfo.classList.add('hidden');
elements.officeInfo.classList.add('hidden');
if (elements.htmlSmuggleInfo) elements.htmlSmuggleInfo.classList.add('hidden');
elements.suspiciousImports.classList.add('hidden');
if (fileInfo.entropy_analysis) {
@@ -359,24 +387,7 @@ document.addEventListener('DOMContentLoaded', function() {
}
else if (fileInfo.office_info) {
elements.officeInfo.classList.remove('hidden');
const office = fileInfo.office_info;
elements.macroStatus.className = `px-3 py-1 text-sm rounded-full ${
office.has_macros ? 'bg-red-500/8 text-red-300 border border-red-500/22' : 'bg-green-500/8 text-green-300 border border-green-500/22'
}`;
elements.macroStatus.textContent = office.has_macros ? 'Macros Present' : 'No Macros';
if (office.detection_notes && office.detection_notes.length > 0) {
elements.macroDetectionNotes.innerHTML = office.detection_notes.map(note => `
<div class="flex items-center space-x-2">
<svg class="w-4 h-4 text-yellow-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-3L13.732 4c-.77-1.333-2.694-1.333-3.464 0L3.34 16c-.77 1.333.192 3 1.732 3z"/>
</svg>
<span>${note}</span>
</div>
`).join('');
}
renderOfficeInfo(fileInfo.office_info);
}
else if (fileInfo.lnk_info) {
// Show LNK-specific information section
@@ -384,6 +395,293 @@ document.addEventListener('DOMContentLoaded', function() {
lnkInfoSection.classList.remove('hidden');
renderLnkInfo(fileInfo.lnk_info);
}
else if (fileInfo.html_smuggle_info) {
const htmlSection = document.getElementById('htmlSmuggleInfo');
if (htmlSection) htmlSection.classList.remove('hidden');
renderHtmlSmuggleInfo(fileInfo.html_smuggle_info);
}
}
// -- Office macro / template-injection rendering --------------------
//
// Surfaces every non-empty piece of the `office_info` structure:
// * Status pill: Macros Present / No Macros
// * Detection notes (one-line summaries)
// * Autoexec triggers (table: keyword + description)
// * Suspicious keywords (table: keyword + description)
// * IOCs (table: type + value)
// * External refs (table: relationship + target -- T1221 etc.)
// * Per-module VBA source code (collapsible <details>)
// * Hex / Base64 / VBA strings (collapsible)
//
// The DOM container (#officeInfo) already exists in upload.html; this
// function rewrites #macroDetectionNotes (status notes) and #macroInfo
// (detail blocks) every time it runs.
// Escape the five HTML-significant characters (& < > " ') so a value can be
// interpolated into an innerHTML template. null / undefined render as ''.
// Any other value is stringified first.
function escapeHtml(s) {
    const entityFor = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    };
    const text = String(s ?? '');
    return text.replace(/[&<>"']/g, ch => entityFor[ch]);
}
// Map an office_info object to a severity bucket:
//   'critical' -- macros present, or an external `attachedTemplate` reference
//   'medium'   -- any other external reference
//   'low'      -- neither of the above
function macroSeverityClass(office) {
    if (office.has_macros) {
        return 'critical';
    }
    const externalRefs = office.external_refs || [];
    const pointsAtTemplate = externalRefs.some(ref => ref.relationship === 'attachedTemplate');
    if (pointsAtTemplate) {
        return 'critical';
    }
    return externalRefs.length > 0 ? 'medium' : 'low';
}
// Build an HTML <table> string.
//   headers -- array of column titles; each is HTML-escaped here.
//   rows    -- array of rows, each an array of cell strings. Cells are
//              inserted verbatim, so callers must pre-escape them.
// Returns '' when there are no rows, so callers can skip the section.
function renderTable(headers, rows) {
    if (!rows.length) return '';

    const thStyle = 'text-align:left;padding:4px 8px;border-bottom:1px solid var(--lb-border);font-size:11px;color:var(--lb-text-dim);text-transform:uppercase;letter-spacing:0.5px;';
    const tdStyle = 'padding:4px 8px;font-size:12px;vertical-align:top;border-bottom:1px solid rgba(255,255,255,0.04);';

    const headerCells = headers
        .map(label => `<th style="${thStyle}">${escapeHtml(label)}</th>`)
        .join('');

    const bodyRows = rows
        .map(cells => {
            const tds = cells.map(cell => `<td style="${tdStyle}">${cell}</td>`).join('');
            return `<tr>${tds}</tr>`;
        })
        .join('');

    return `<table style="width:100%;border-collapse:collapse;margin:6px 0 12px 0;"><thead><tr>${headerCells}</tr></thead><tbody>${bodyRows}</tbody></table>`;
}
// Wrap a pre-rendered HTML body under a small uppercase heading.
//   title -- heading text (HTML-escaped here)
//   body  -- HTML string inserted verbatim; a falsy body yields ''
//   opts  -- optional { collapsible, open, summary }; when `collapsible` is
//            truthy the body goes inside a <details> element whose <summary>
//            text is `opts.summary` (default 'show'), expanded when `opts.open`
//            is truthy.
function renderSection(title, body, opts) {
    if (!body) return '';

    const options = opts || {};
    const heading = `<div style="font-size:11px;color:var(--lb-text-dim);text-transform:uppercase;letter-spacing:0.5px;margin:14px 0 4px 0;">${escapeHtml(title)}</div>`;

    if (!options.collapsible) {
        return `${heading}${body}`;
    }

    const openAttr = options.open ? 'open' : '';
    const summaryText = escapeHtml(options.summary || 'show');
    return `${heading}<details ${openAttr} style="border:1px solid var(--lb-border);padding:8px;border-radius:3px;background:rgba(255,255,255,0.02);"><summary style="cursor:pointer;font-size:12px;color:var(--lb-text);">${summaryText}</summary>${body}</details>`;
}
// Render the `office_info` structure into the upload-result page.
//
// Writes to three places:
//   * elements.macroStatus          -- status pill (class + text)
//   * elements.macroDetectionNotes  -- one-line detection notes
//   * #macroInfo                    -- detail sections (external refs, autoexec
//                                      triggers, suspicious keywords, IOCs,
//                                      decoded strings, per-module VBA source)
//
// Every value taken from `office` is HTML-escaped before it reaches innerHTML.
// External-reference targets are attacker-controlled (they come from the
// uploaded document), so they are only rendered as clickable links when they
// are plain http(s) URLs; any other scheme (javascript:, data:, file:, UNC
// paths, ...) is shown as inert text.
function renderOfficeInfo(office) {
    // Status pill
    const sev = macroSeverityClass(office);
    const sevClassMap = {
        critical: 'bg-red-500/8 text-red-300 border border-red-500/22',
        medium: 'bg-yellow-500/8 text-yellow-300 border border-yellow-500/22',
        low: 'bg-green-500/8 text-green-300 border border-green-500/22',
    };
    elements.macroStatus.className = `px-3 py-1 text-sm rounded-full ${sevClassMap[sev]}`;
    elements.macroStatus.textContent = office.has_macros
        ? 'Macros Present'
        : ((office.external_refs || []).length > 0 ? 'External Refs' : 'No Macros');

    // Top-level detection notes (one-line summaries)
    const notes = office.detection_notes || [];
    elements.macroDetectionNotes.innerHTML = notes.map(note => `
        <div style="display:flex;align-items:flex-start;gap:6px;margin-bottom:3px;">
            <span style="color:var(--lb-warn);">⚠</span>
            <span>${escapeHtml(note)}</span>
        </div>
    `).join('');

    // Detailed sections
    const macroInfo = document.getElementById('macroInfo');
    if (!macroInfo) return;
    const parts = [];

    // Escaping alone does not neutralise a dangerous URL scheme inside href,
    // so only http(s) targets become anchors; everything else is plain text.
    const renderTarget = target => {
        const safeText = escapeHtml(target);
        const isWebUrl = /^https?:\/\//i.test(String(target ?? ''));
        return isWebUrl
            ? `<a href="${safeText}" target="_blank" rel="noopener noreferrer" style="color:var(--lb-accent-soft);">${safeText}</a>`
            : safeText;
    };

    // External references (T1221 etc.) -- shown FIRST when present
    // since they're often the only signal for documents that have no VBA.
    const refs = office.external_refs || [];
    if (refs.length > 0) {
        const rows = refs.map(r => [
            `<span class="lb-tag ${r.relationship === 'attachedTemplate' ? 'critical' : 'medium'}">${escapeHtml(r.relationship)}</span>`,
            `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${renderTarget(r.target)}</span>`,
            `<span class="lb-mono" style="font-size:11px;color:var(--lb-text-dim);">${escapeHtml(r.rels_file)}</span>`,
        ]);
        parts.push(renderSection('External References (Remote Targets)', renderTable(['Relationship', 'Target', 'In .rels'], rows)));
    }

    const a = office.analysis || {};

    // Autoexec triggers
    if ((a.autoexec || []).length > 0) {
        const rows = a.autoexec.map(e => [
            `<span class="lb-tag critical">${escapeHtml(e.keyword || '?')}</span>`,
            `<span style="font-size:12px;">${escapeHtml(e.description || '')}</span>`,
        ]);
        parts.push(renderSection(`Auto-Execution Triggers (${a.autoexec.length})`, renderTable(['Keyword', 'Description'], rows)));
    }

    // Suspicious keywords
    if ((a.suspicious || []).length > 0) {
        const rows = a.suspicious.map(e => [
            `<span class="lb-tag medium">${escapeHtml(e.keyword || '?')}</span>`,
            `<span style="font-size:12px;">${escapeHtml(e.description || '')}</span>`,
        ]);
        parts.push(renderSection(`Suspicious Keywords (${a.suspicious.length})`, renderTable(['Keyword', 'Description'], rows)));
    }

    // IOCs (URLs, IPs, EXEs, etc. that olevba pulled out of the macro body)
    if ((a.iocs || []).length > 0) {
        const rows = a.iocs.map(ioc => [
            `<span class="lb-tag info">${escapeHtml(ioc.type || '?')}</span>`,
            `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${escapeHtml(ioc.value || '')}</span>`,
        ]);
        parts.push(renderSection(`IOCs Extracted from Macro (${a.iocs.length})`, renderTable(['Type', 'Value'], rows)));
    }

    // Hex / Base64 / VBA-encoded strings (decoded by olevba)
    const stringSets = [
        ['Hex Strings', a.hex_strings || []],
        ['Base64 Strings', a.base64_strings || []],
        ['VBA-Encoded Strings', a.vba_strings || []],
    ];
    for (const [label, items] of stringSets) {
        if (items.length === 0) continue;
        const body = items.map(e => `<div class="lb-mono" style="word-break:break-all;font-size:11px;padding:3px 0;border-bottom:1px solid rgba(255,255,255,0.04);"><strong>${escapeHtml(e.keyword || '')}:</strong> ${escapeHtml(e.description || '')}</div>`).join('');
        parts.push(renderSection(`${label} (${items.length})`, body, { collapsible: true, summary: `${items.length} item(s) -- click to expand` }));
    }

    // Per-module VBA source code -- collapsible
    const modules = office.modules || [];
    if (modules.length > 0) {
        const body = modules.map(m => `
            <div style="margin-top:8px;">
                <div style="font-size:12px;color:var(--lb-text);margin-bottom:4px;">
                    <span class="lb-mono" style="color:var(--lb-accent-soft);">${escapeHtml(m.vba_filename || '?')}</span>
                    <span class="lb-muted" style="font-size:11px;"> -- ${escapeHtml(m.stream || '')}</span>
                </div>
                <pre style="background:rgba(0,0,0,0.3);padding:8px;border:1px solid var(--lb-border);font-size:11px;overflow-x:auto;max-height:240px;overflow-y:auto;white-space:pre-wrap;color:var(--lb-text);">${escapeHtml(m.code || '')}</pre>
            </div>
        `).join('');
        parts.push(renderSection(`VBA Source (${modules.length} module${modules.length !== 1 ? 's' : ''})`, body, { collapsible: true, summary: `${modules.length} module(s) -- click to view source code` }));
    }

    macroInfo.innerHTML = parts.join('');
}
// -- HTML smuggling rendering --------------------------------------
//
// Surfaces every non-empty piece of the `html_smuggle_info` structure
// produced by app/utils/htmlsmuggle.py:
// * Status pill: SMUGGLING / SUSPICIOUS / CLEAN with score
// * Detection notes (one-line summaries)
// * Matched-categories pill row (the score itself is shown in the status pill)
// * Matched patterns (table: name + category + weight)
// * Surface features (table: feature + value)
// * IOCs (download filenames, dataset blobs, largest base64 preview)
//
// Reuses the renderTable / renderSection / escapeHtml helpers defined
// for the office macro renderer.
// Map an html_smuggle_info object to a severity bucket:
//   'critical' -- the analyzer flagged the file as smuggling
//   'medium'   -- not flagged, but the score is above zero
//   'low'      -- zero or missing score
function smuggleSeverityClass(h) {
    const score = h.score || 0;
    if (h.is_smuggling) {
        return 'critical';
    }
    return score > 0 ? 'medium' : 'low';
}
// Render the `html_smuggle_info` structure into the upload-result page.
//
// Writes to three places (each skipped when its element is missing):
//   * elements.smuggleStatus          -- status pill: SMUGGLING / SUSPICIOUS
//                                        (both with score/threshold) or CLEAN
//   * elements.smuggleDetectionNotes  -- one-line detection notes
//   * elements.smuggleInfo            -- detail sections (category pills,
//                                        matched patterns, surface features,
//                                        IOCs, truncation notice)
//
// Every value interpolated into innerHTML goes through escapeHtml, including
// the numeric fields (category counts, pattern weights, blob length), so a
// malformed or tampered result file cannot inject markup.
function renderHtmlSmuggleInfo(h) {
    // Status pill
    const sev = smuggleSeverityClass(h);
    const sevClassMap = {
        critical: 'bg-red-500/8 text-red-300 border border-red-500/22',
        medium: 'bg-yellow-500/8 text-yellow-300 border border-yellow-500/22',
        low: 'bg-green-500/8 text-green-300 border border-green-500/22',
    };
    if (elements.smuggleStatus) {
        elements.smuggleStatus.className = `px-3 py-1 text-sm rounded-full ${sevClassMap[sev]}`;
        const label = h.is_smuggling
            ? `SMUGGLING (score ${h.score}/${h.threshold})`
            : (h.score > 0 ? `SUSPICIOUS (score ${h.score}/${h.threshold})` : 'CLEAN');
        // textContent, not innerHTML -- no escaping needed here.
        elements.smuggleStatus.textContent = label;
    }

    // Detection notes
    const notes = h.detection_notes || [];
    if (elements.smuggleDetectionNotes) {
        elements.smuggleDetectionNotes.innerHTML = notes.map(note => `
            <div style="display:flex;align-items:flex-start;gap:6px;margin-bottom:3px;">
                <span style="color:var(--lb-warn);">⚠</span>
                <span>${escapeHtml(note)}</span>
            </div>
        `).join('');
    }

    // Detail blocks
    const host = elements.smuggleInfo;
    if (!host) return;
    const parts = [];

    // Matched-category pills (category name x hit count)
    const cats = h.matched_categories || {};
    if (Object.keys(cats).length > 0) {
        const pills = Object.entries(cats).map(([cat, count]) =>
            `<span class="lb-tag medium" style="margin-right:4px;">${escapeHtml(cat)} × ${escapeHtml(count)}</span>`
        ).join(' ');
        parts.push(renderSection('Pattern Categories', `<div style="padding:4px 0;">${pills}</div>`));
    }

    // Matched patterns -- the actual signatures that fired
    const matches = h.matched_patterns || [];
    if (matches.length > 0) {
        const rows = matches.map(m => [
            `<span class="lb-mono" style="font-size:11px;">${escapeHtml(m.name)}</span>`,
            `<span class="lb-tag info">${escapeHtml(m.category || '?')}</span>`,
            `<span class="lb-mono" style="font-size:11px;">+${escapeHtml(m.weight || 0)}</span>`,
        ]);
        parts.push(renderSection(`Matched Patterns (${matches.length})`, renderTable(['Pattern', 'Category', 'Weight'], rows)));
    }

    // Surface features
    const f = h.features || {};
    if (Object.keys(f).length > 0) {
        const featureRows = [
            ['File size (bytes)', f.file_size],
            ['Script tags', f.script_tags],
            ['iframe tags', f.iframe_tags],
            ['embed tags', f.embed_tags],
            ['Base64 blob count (>=50 chars)', f.base64_blob_count],
            ['Largest base64 blob (chars)', f.largest_base64_chars],
            ['Has blob()', f.has_blob],
            ['Has atob()', f.has_atob],
            ['Has Uint8Array', f.has_uint8array],
            ['Has URL.createObjectURL', f.has_createobjecturl],
            ['Has <a download="...">', f.has_download_attr],
            ['Has String.fromCharCode', f.has_fromcharcode],
        ]
            // Hide absent, false and zero-valued features so only signals that
            // are actually present show up in the table.
            .filter(([, v]) => v !== undefined && v !== null && v !== false && v !== 0)
            .map(([label, v]) => [
                `<span style="font-size:12px;">${escapeHtml(label)}</span>`,
                `<span class="lb-mono" style="font-size:12px;">${escapeHtml(String(v))}</span>`,
            ]);
        if (featureRows.length > 0) {
            parts.push(renderSection('Surface Features', renderTable(['Feature', 'Value'], featureRows)));
        }
    }

    // IOCs
    const iocs = h.iocs || {};
    const iocBits = [];
    if ((iocs.download_filenames || []).length > 0) {
        const rows = iocs.download_filenames.map(name => [
            `<span class="lb-tag medium">download=</span>`,
            `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${escapeHtml(name)}</span>`,
        ]);
        iocBits.push(renderTable(['Type', 'Value'], rows));
    }
    if ((iocs.data_file_attrs || []).length > 0) {
        const rows = iocs.data_file_attrs.map(d => [
            `<span class="lb-tag medium">data-file=</span>`,
            `<span class="lb-mono" style="word-break:break-all;font-size:11px;">${escapeHtml(d)}</span>`,
        ]);
        iocBits.push(renderTable(['Type', 'Value (truncated)'], rows));
    }
    if (iocs.largest_base64_blob && iocs.largest_base64_blob.length > 0) {
        const b = iocs.largest_base64_blob;
        iocBits.push(`
            <div class="lb-mono" style="font-size:11px;padding:4px 0;">
                <div><strong>Largest base64 blob:</strong> ${escapeHtml(b.length)} chars</div>
                <div style="margin-top:4px;color:var(--lb-text-dim);">First 120: <span style="color:var(--lb-text);word-break:break-all;">${escapeHtml(b.preview_first_120)}</span></div>
                ${b.preview_last_120 ? `<div style="margin-top:4px;color:var(--lb-text-dim);">Last 120: <span style="color:var(--lb-text);word-break:break-all;">${escapeHtml(b.preview_last_120)}</span></div>` : ''}
            </div>
        `);
    }
    if (iocBits.length > 0) {
        parts.push(renderSection('IOCs', iocBits.join('')));
    }

    if (h.truncated) {
        parts.push(`<div class="lb-muted" style="font-size:11px;margin-top:8px;">⚠ Scan was truncated -- file exceeds the 5 MiB cap.</div>`);
    }

    host.innerHTML = parts.join('');
}
function getRuntimeConfig(buildWith) {
+11 -1
View File
@@ -75,7 +75,7 @@
<svg fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M4 6h16M4 12h16M4 18h7"/><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M14 16l3 3 5-5"/></svg>
All
</button>
<button type="button" class="lb-tab" data-mode="static" data-family="regular">
<button type="button" class="lb-tab" data-mode="static" data-family="regular office html">
<svg fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"/></svg>
Static
</button>
@@ -363,6 +363,16 @@
<div id="macroInfo" style="font-size: 12px; color: var(--lb-text-dim);"></div>
</div>
<!-- HTML smuggling info -->
<div id="htmlSmuggleInfo" class="hidden" style="border: 1px solid var(--lb-border); padding: 12px; margin-bottom: 12px;">
<div style="display: flex; align-items: center; justify-content: space-between; margin-bottom: 8px;">
<span class="lb-eyebrow">HTML Smuggling Analysis</span>
<span id="smuggleStatus" class="lb-tag muted"></span>
</div>
<div id="smuggleDetectionNotes" style="font-size: 12px; color: var(--lb-text-dim); margin-bottom: 6px;"></div>
<div id="smuggleInfo" style="font-size: 12px; color: var(--lb-text-dim);"></div>
</div>
<!-- File-specific info -->
<div id="fileSpecificInfo" style="font-size: 13px; color: var(--lb-text-dim);"></div>
</div>
+6 -5
View File
@@ -8,8 +8,6 @@ Prefer importing directly from submodules in new code:
from .file_io import (
FileTypeDetector,
detect_file_type,
get_lnk_info,
get_office_info,
get_pe_info,
save_uploaded_file,
)
@@ -19,6 +17,9 @@ from .forensics import (
calculate_entropy,
get_security_analyzer,
)
from .htmlsmuggle import get_html_smuggle_info
from .lnk import get_lnk_info
from .office import get_office_info
from .json_helpers import (
extract_detection_counts,
format_hex,
@@ -41,7 +42,7 @@ __all__ = [
'allowed_file', 'calculate_entropy', 'calculate_risk', 'calculate_yara_risk',
'check_tool', 'detect_file_type', 'extract_detection_counts',
'find_file_by_hash', 'format_hex', 'format_size', 'generate_html_report',
'get_entropy_risk_level', 'get_lnk_info', 'get_office_info', 'get_pe_info',
'get_risk_level', 'get_security_analyzer', 'load_json_file',
'save_uploaded_file', 'validate_pid',
'get_entropy_risk_level', 'get_html_smuggle_info', 'get_lnk_info',
'get_office_info', 'get_pe_info', 'get_risk_level',
'get_security_analyzer', 'load_json_file', 'save_uploaded_file', 'validate_pid',
]
+40 -20
View File
@@ -1,5 +1,15 @@
# app/utils/file_io.py
"""File ingestion: type detection, PE/Office/LNK metadata, upload handling."""
"""File ingestion: type detection, upload handling.
Per-file-type inspectors (PE / Office / LNK / HTML-smuggling) are dispatched
from `save_uploaded_file` based on the detected family. Each inspector lives
in its own module:
* PE -- get_pe_info (this module, uses forensics.SecurityAnalyzer)
* Office -- utils/office.py (get_office_info)
* LNK -- utils/lnk.py (get_lnk_info)
* HTML smuggling -- utils/htmlsmuggle.py (get_html_smuggle_info)
"""
import datetime
import hashlib
import json
@@ -11,8 +21,10 @@ import struct
import pefile
from werkzeug.utils import secure_filename
from ..analyzers.static.lnk_parser import LnkForensics
from .forensics import calculate_entropy, get_security_analyzer
from .htmlsmuggle import get_html_smuggle_info
from .lnk import get_lnk_info
from .office import get_office_info
from .risk_analyzer import RiskCalculator
@@ -42,6 +54,12 @@ class FileTypeDetector:
elif header.startswith(cls.LNK_HEADER):
return cls._detect_lnk_type(filepath)
# HTML / HTM detection -- file-extension based since HTML has no
# consistent magic. Cheap to check after the binary-header tests
# already missed.
if p.suffix.lower() in ('.html', '.htm'):
return {"family": "html", "type": p.suffix.lower().lstrip('.')}
return {"family": "unknown", "type": "unknown"}
except Exception as e:
@@ -142,8 +160,20 @@ class FileTypeDetector:
"visio/document.xml": "vsdx",
}
# Flag macro-enabled OOXML by presence of vbaProject.bin --
# promotes docx/xlsx/pptx -> docm/xlsm/pptm so the dashboard
# Type field reflects what's actually in the container.
has_vba = any(n.endswith("vbaproject.bin") for n in names)
macro_enabled_map = {
"docx": "docm",
"xlsx": "xlsm",
"pptx": "pptm",
}
for path, file_type in ooxml_types.items():
if path in names:
if has_vba and file_type in macro_enabled_map:
file_type = macro_enabled_map[file_type]
return {"family": "office", "type": file_type}
return {"family": "office", "type": "ooxml-unknown"}
@@ -331,24 +361,9 @@ def _build_pe_detection_notes(is_valid_checksum, suspicious_imports,
return detection_notes
def get_office_info(filepath, malapi_path):
"""Analyze Office macros (delegates to SecurityAnalyzer)."""
return get_security_analyzer(malapi_path).analyze_office_macros(filepath)
def get_lnk_info(filepath):
"""Analyze a Windows .LNK shortcut for forensic data."""
try:
lnk = LnkForensics(filepath)
if not lnk.is_valid():
return {'lnk_info': None}
forensic_data = lnk.get_forensic_data()
return {'lnk_info': forensic_data}
except Exception as e:
print(f"Error analyzing LNK file: {e}")
return {'lnk_info': None}
# Office / LNK / HTML-smuggling inspectors live in their own modules
# (imported at the top of this file). PE inspection stays here because it's
# tightly coupled to the SecurityAnalyzer cache (MalAPI lookup + entropy).
def _build_entropy_analysis(entropy_value):
@@ -456,6 +471,11 @@ def save_uploaded_file(file, config):
if 'error' not in lnk_result:
file_info.update(lnk_result)
elif file_type_info['family'] == 'html':
# Always update -- get_html_smuggle_info returns a usable dict even
# for clean files (just with is_smuggling=false / score=0).
file_info.update(get_html_smuggle_info(filepath))
with open(os.path.join(result_folder, filename, 'file_info.json'), 'w') as f:
json.dump(file_info, f)
+7 -43
View File
@@ -1,11 +1,15 @@
# app/utils/forensics.py
"""PE/Office forensic analysis: entropy, runtime detection, MalAPI lookup."""
"""PE forensic analysis: entropy, runtime detection, MalAPI lookup.
Office / LNK / HTML-smuggling analyzers live in their own modules
(`utils/office.py`, `utils/lnk.py`, `utils/htmlsmuggle.py`) so each file-type
inspector is self-contained and easy to maintain. This module is now strictly
PE-focused.
"""
import json
import math
from collections import Counter
from oletools.olevba import VBA_Parser
# Known runtime imports for compiled languages — used to flag PE imports as
# benign-runtime rather than suspicious.
@@ -251,46 +255,6 @@ class SecurityAnalyzer:
return sections_info
def analyze_office_macros(self, filepath):
"""Inspect Office VBA macros for suspicious patterns."""
try:
vbaparser = VBA_Parser(filepath)
detection_notes = []
info = {
'file_type': 'Microsoft Office Document',
'has_macros': vbaparser.detect_vba_macros(),
'macro_info': None,
'detection_notes': detection_notes,
}
if vbaparser.detect_vba_macros():
macro_analysis = vbaparser.analyze_macros()
info['macro_info'] = macro_analysis
macro_text = str(macro_analysis).lower()
detection_patterns = {
'shell': 'Shell command execution detected',
'wscript': 'WScript execution detected',
'powershell': 'PowerShell execution detected',
'http': 'Network communication detected',
'auto': 'Auto-execution mechanism detected',
'document_open': 'Document open auto-execution',
'windowshide': 'Hidden window execution',
'createobject': 'COM object creation detected',
}
for pattern, note in detection_patterns.items():
if pattern in macro_text:
detection_notes.append(note)
vbaparser.close()
return {'office_info': info}
except Exception as e:
print(f"Error analyzing Office file: {e}")
return {'office_info': None}
_security_analyzer_cache = {}
+282
View File
@@ -0,0 +1,282 @@
# app/utils/htmlsmuggle.py
"""HTML-smuggling pattern scanner.
Runs at upload time on `.html` / `.htm` files (alongside `get_pe_info`,
`get_office_info`, `get_lnk_info`). Output lands in `file_info.html_smuggle_info`
and is rendered on the upload-result page the same way office_info is.
Pattern set + scoring model ported from SmuggleShield's `content.js`
(https://github.com/RootUp/SmuggleShield). The browser extension catches
runtime behaviour (DOM mutation, blob URL revoke, programmatic <a download>
click); we catch the file-on-disk equivalent by regex-scanning the raw
HTML source.
Scoring (mirrors SmuggleShield):
- Each pattern carries a weight (2-4).
- High-weight (>=3) patterns scanned first; early-return when the score
crosses the threshold.
- Low-weight (<3) patterns scanned only when high-weight pass landed
within `threshold - 2` of crossing.
- A cheap pre-filter (`atob | blob | base64 | createobjecturl | ...`)
skips files that obviously aren't smuggling.
"""
import os
import re
from typing import Dict, List
# Signature table. Each entry is (weight, pattern, name, category):
#   weight   -- score added when the pattern matches; entries with
#               weight >= 3 end up in `_HIGH`, the rest in `_LOW`.
#   pattern  -- regex source. Every pattern is compiled with
#               re.IGNORECASE | re.DOTALL, so lowercase literals also match
#               mixed-case source and `.` matches across newlines.
#   name     -- identifier reported for a hit in `matched_patterns`.
#   category -- bucket label tallied into `matched_categories`.
_PATTERNS = [
# --- Direct base64 -> binary -> blob path ----------------------------
(3, r'atob\s*\([^)]+\).*new\s+uint8array', 'atob_to_uint8array', 'encoding'),
(3, r"atob\s*\(\s*['\"]([A-Za-z0-9+/=]{100,})['\"].*\)", 'large_base64_atob', 'encoding'),
(3, r'new\s+blob\s*\(\s*\[\s*(?:data|atob\s*\()', 'blob_from_atob_data', 'blob'),
(4, r"let\s+arrayBuffer\s*=\s*\['0x[0-9a-f]{2}'(?:\s*,\s*'0x[0-9a-f]{2}')+\]", 'hex_array_buffer', 'encoding'),
# --- Reversed-string fromCharCode obfuscation ------------------------
(4, r'\["edoCrahCmorf"(?:\s*\[\s*"split"\s*\]\s*\(\s*""\s*\)\s*\[\s*"reverse"\s*\]\s*\(\s*\)\s*\[\s*"join"\s*\]\s*\(\s*""\s*\))', 'reversed_fromcharcode_obf', 'obfuscation'),
(4, r'setTimeout\s*\(\s*\[.*?\]\.map\s*\(\s*.*?=>.*?(?:fromCharCode|edoCrahCmorf).*?\/\s*\d+\s*\)', 'settimeout_fromcharcode', 'obfuscation'),
(3, r'String\s*\[\s*(?:"edoCrahCmorf"|[\'"][^\'\"]+[\'"]\.split\([\'"][\'"]\)\.reverse\(\)\.join\([\'"][\'"]\))\s*\]', 'string_reverse_index', 'obfuscation'),
# --- Blob -> object URL -> download chain ----------------------------
(3, r'url\.createobjecturl\s*\(\s*(?:my)?blob\s*\)', 'createobjecturl_from_blob', 'blob'),
(3, r'location(?:\s*\[\s*[\'"]href[\'"]\s*\])?\s*=\s*url', 'location_href_assign', 'writer'),
(2, r'url\.revokeobjecturl\s*\(\s*url\s*\)', 'revokeobjecturl', 'blob'),
(3, r'\.style\s*=\s*[\'"]display:\s*none[\'"].*\.href\s*=.*\.download\s*=', 'hidden_anchor_download', 'writer'),
(3, r'\.click\s*\(\s*\).*url\.revokeobjecturl', 'auto_click_then_revoke', 'writer'),
(3, r'href\s*=\s*["\']data:(?:application/octet-stream|image/svg\+xml);base64,', 'data_url_octet_stream', 'writer'),
# --- Bracket-string property access (window["a"+"to"+"b"] etc.) ------
(3, r'window\s*\[\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\]', 'window_bracket_concat', 'obfuscation'),
(4, r'document\s*\[\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\]\s*\(\s*window\s*\[\s*(?:[\'"]at[\'"].*[\'"]o[\'"].*[\'"]b[\'"]\s*\]|\s*(?:["\']\w+["\']\s*\+\s*)+["\']\w+["\']\s*\])\s*\([\'"][A-Za-z0-9+/=]+[\'"]\)\s*\)', 'document_bracket_atob', 'obfuscation'),
(4, r'var\s+\w+=\w+;?\s*\(function\(\w+,\w+\)\{.*while\(!!\[\]\)\{try\{.*parseint.*\}catch\(\w+\)\{.*\}\}\(.*\)\);?', 'parseint_obfuscator', 'obfuscation'),
# --- Blob mime-type signatures + writer chain ------------------------
(3, r'blob\s*\(\s*\[[^\]]+\]\s*,\s*\{\s*type\s*:\s*[\'"](?:application/octet-stream|text/html|octet/stream)[\'"](?:\s*,\s*encoding\s*:\s*[\'"]base64[\'"])?\s*\}\s*\)', 'blob_with_octet_type', 'blob'),
# --- WebAssembly / Go runtime smuggling ------------------------------
(3, r'webassembly\s*\.\s*(?:instantiate(?:streaming)?|instance)', 'webassembly_instantiate', 'wasm'),
(2, r'navigator\.serviceworker\.register', 'service_worker_register', 'wasm'),
(2, r'wasm[_-]?exec\.js', 'wasm_exec_js', 'wasm'),
(3, r'\.wasm\b', 'wasm_extension_ref', 'wasm'),
(3, r'new\s+go\s*\(\s*\)', 'go_runtime_new', 'wasm'),
(3, r'go\s*\.\s*run\s*\(', 'go_runtime_run', 'wasm'),
# --- Embedded srcdoc / iframe + script -------------------------------
(3, r'srcdoc\s*=\s*["\'][^"\']*<script', 'srcdoc_with_script', 'writer'),
(3, r'<embed[^>]*base64', 'embed_with_base64', 'writer'),
# --- Decoder helpers + legacy IE save -------------------------------
(3, r'function\s+(?:b64toarray|xor|base64toarraybuffer)\s*\([^)]*\)\s*\{[\s\S]*?return\s+(?:bytes\.buffer|result);?\}', 'decoder_helper_func', 'encoding'),
(3, r'document\.createelement\([\'"]embed[\'"]\)', 'createelement_embed', 'writer'),
(2, r'\.setattribute\([\'"]src[\'"]\s*,\s*.*\)', 'setattribute_src', 'writer'),
(3, r'window\.navigator\.mssaveoropenblob\s*\(\s*blob\s*,\s*filename\s*\)', 'mssaveoropenblob', 'writer'),
(2, r'(?:window\.)?url\.createobjecturl\s*\(\s*(?:blob|[^)]+)\s*\)', 'generic_createobjecturl', 'blob'),
(2, r'(?:a|element)\.download\s*=\s*(?:filename|[\'"][^\'"]+[\'"])', 'anchor_download_attr', 'writer'),
(2, r'string\.fromcharcode\(.*\)', 'string_fromcharcode', 'encoding'),
(2, r'\.charcodeat\(.*\)', 'charcodeat', 'encoding'),
(3, r'document\.getelementbyid\([\'"]passwordid[\'"]\)\.value', 'password_field_lookup', 'writer'),
(3, r'import\s*\(\s*url\.createobjecturl\s*\(', 'dynamic_import_objurl', 'wasm'),
(3, r'\w+\s*\(\s*\w+\s*\(\s*[\'"][A-Za-z0-9+/=]{50,}[\'"]\s*\)\s*\)', 'nested_call_long_b64', 'encoding'),
(2, r'(?:window\.)?atob\s*\(', 'atob_call', 'encoding'),
(2, r'uint8[aA]rray\s*\(\s*(?:(?!len)[^)])*\)', 'uint8array_constructor', 'encoding'),
(3, r'mssaveoropenblob|mssaveblob', 'mssave_alias', 'writer'),
(3, r'base64toarraybuffer', 'b64_to_arraybuffer_helper', 'encoding'),
(3, r'xmlhttprequest\(\).*\.responsetype\s*=\s*[\'"]arraybuffer[\'"]', 'xhr_arraybuffer_response', 'encoding'),
(3, r'new\s+dataview\(.*\).*\.getuint8\(.*\).*\.setuint8\(', 'dataview_getset_uint8', 'encoding'),
(2, r'[^\w](\w+)\s*=\s*(\w+)\s*\^\s*(\w+)', 'xor_operation', 'encoding'),
(2, r'\.slice\(\s*\w+\s*-\s*\d+\s*,\s*\w+\s*-\s*\d+\s*\)', 'string_slice_offset', 'obfuscation'),
(3, r'for\s*\([^)]+\)\s*\{[^}]*string\.fromcharcode\([^)]+\)', 'loop_fromcharcode', 'encoding'),
# --- GWT (Google Web Toolkit) smuggling artefacts --------------------
(4, r'\$wnd\s*=\s*window;\s*\$doc\s*=\s*\$wnd\.document', 'gwt_wnd_doc', 'gwt'),
(4, r'__gwt_(?:isKnownPropertyValue|getMetaProperty|marker|stylesLoaded|scriptsLoaded)', 'gwt_internals', 'gwt'),
(3, r'\$strongName\s*=\s*[\'"][0-9A-F]{32}[\'"]', 'gwt_strong_name', 'gwt'),
(3, r'\$gwt_version\s*=\s*[\'"][0-9.]+[\'"]', 'gwt_version', 'gwt'),
(4, r'(?:function|var)\s+[a-zA-Z$_]+\s*=\s*\{\s*[a-zA-Z$_]+:\s*window,\s*[a-zA-Z$_]+:\s*document\s*\}', 'gwt_window_doc_pair', 'gwt'),
(3, r'\b(?:gwtOnLoad|__gwtStatsEvent|gwtOnLoadFunc)\b', 'gwt_onload', 'gwt'),
(3, r'\.setAttribute\([\'"]__gwt_property[\'"]', 'gwt_property_attr', 'gwt'),
(4, r'document\.createElement\([\'"]script[\'"]\).*?\.src\s*=.*?\.cache\.js', 'gwt_cache_js', 'gwt'),
# --- Mouse/event-triggered drop chains -------------------------------
(4, r'(?:document|window)\.on(?:mousemove|load|mouseover)\s*=\s*function\s*\(\s*\)\s*\{[^}]*?data:application/[^}]*?\.click\(\)[^}]*?(?:removeChild|remove)\(', 'mouse_event_drop', 'writer'),
(4, r'(?:window|var|let)\.\w+Triggered\s*=\s*(?:true|false).*?(?:navigator|platform).*?data:application/[^;]+;base64,.*?\.(?:download|click)', 'triggered_flag_drop', 'writer'),
(4, r'navigator\[?["\']platform["\']\]?.*?(?:document|window)\.on\w+.*?data:application/', 'platform_event_drop', 'writer'),
# --- Generic split/concat/reverse obfuscation ------------------------
(3, r'\[[\'"][^\'\"]+[\'"]\s*\+\s*[\'"][^\'\"]+[\'"]\]', 'string_concat_index', 'obfuscation'),
(3, r"\[\'[a-z]+\'\s*\+\s*\'[a-z]+\'\]", 'concat_lower_index', 'obfuscation'),
(3, r"\[\s*(?:[\'\"]\w?[\'\"](?:\s*,\s*)?){4,}\s*\]\.join\s*\(\s*[\'\"]*\s*\)", 'array_join_join', 'obfuscation'),
(3, r'const\s+\w+\s*=\s*\[\s*(?:[\'"]\w?[\'"](?:\s*,\s*)?){4,}', 'const_char_array', 'obfuscation'),
(4, r'(\[(?:\][^(]*|\[\])[^(]*|\w+\.)constructor\s*\(\s*([\'"])return\s*\w+\2\s*\)', 'constructor_return', 'obfuscation'),
(4, r'Function\s*\(\s*[\'"]return\s+\w+[\'"](?:\s*\)\s*\(\s*\)|\(\))', 'function_return', 'obfuscation'),
(3, r'\w+\.split\s*\(\s*[\'"][\'\"]?\s*\)\.reverse\s*\(\s*\)\.join\s*\(', 'split_reverse_join', 'obfuscation'),
(3, r'\[\s*\w+\.split\s*\(\s*[\'"][\'"]\s*\)\.reverse\s*\(\s*\)', 'array_split_reverse', 'obfuscation'),
(3, r'setTimeout\s*\(\s*(?:function|\(\)|[^,]+)\s*(?:=>)?\s*\{[\s\S]{10,}?setTimeout\s*\(', 'nested_settimeout', 'obfuscation'),
(4, r'setTimeout\s*\([^{)]*\{[^{}]*setTimeout\s*\([^{)]*\{[^{}]*\}', 'double_settimeout', 'obfuscation'),
(4, r'new\s*\([^)]*\[\s*(?:[\'"][^\'\"]+[\'"]\.split|[\'"]\w+[\'"]\.split)', 'new_with_split_index', 'obfuscation'),
(3, r'\[[^\]]*(?:join|reverse)[^\]]*\]\s*\(\s*(?:\w+|[\'"][^\'"]*[\'"])\s*\)', 'index_join_reverse', 'obfuscation'),
(3, r'\[\s*(?:urlMethod|parts\.join\(\)|[\'"]\w+[\'"]\s*\+)', 'partsjoin_index', 'obfuscation'),
(4, r'\w+\s*\[\s*(?:[\'"][^\'\"]+[\'"](?:\s*\+\s*)?)+\s*\]\s*\(\s*\w+\s*\)', 'concat_call', 'obfuscation'),
# --- "down" + "load" decomposition (extremely common) ----------------
(3, r'[\'"]?down[\'"]?\s*\+\s*[\'"]?load[\'"]?', 'down_plus_load', 'obfuscation'),
(4, r"\['down' \+ 'load'\]", 'down_load_bracket_exact', 'obfuscation'),
(4, r'createElement\s*\(\s*[\'"]a[\'"]\s*\)[^}]*?\[\s*[\'"]\w+[\'"]\s*\+\s*[\'"]\w+[\'"]\s*\]', 'createanchor_concat_attr', 'writer'),
(3, r"\['style'\]\['visi' \+ 'bility'\]", 'visibility_concat', 'obfuscation'),
# --- Chunked-substr + dataset-based payload chains -------------------
(3, r'function\s+\w+Chunks\s*\([^)]*\)\s*\{[^{}]*for\s*\([^{}]*\)\s*\{[^{}]*substr', 'chunk_substr_loop', 'encoding'),
(2, r'\.substr\s*\(\s*\w+\s*,\s*\w+Size\s*\)', 'substr_size_param', 'encoding'),
(4, r'\(async\s*\(\s*\)\s*=>\s*\{\s*(?:let|var|const)\s+d\s*=.*?(?:document\.getElementById|document\.querySelector).*?dataset.*?\.href\s*=\s*d.*?\.download\s*=.*?\.click\s*\(\s*\)', 'async_dataset_click', 'writer'),
(4, r'\bdocument\.getElementById\s*\(\s*[\'"]data[\'"]\s*\).*?\.dataset\.file.*?createElement\s*\(\s*[\'"]a[\'"]\s*\).*?\.download\s*=', 'data_div_dataset_anchor', 'writer'),
(3, r'<div[^>]*id\s*=\s*["\']data["\'][^>]*data-file\s*=\s*["\'][A-Za-z0-9+/=]{50,}["\'][^>]*>', 'data_div_with_b64', 'writer'),
(4, r'<script>\s*\(\s*async\s*\(\s*\)\s*=>\s*\{[^}]*createElement\s*\(\s*[\'"]a[\'"]\s*\)[^}]*\.click\s*\(\s*\)[^}]*\.remove\s*\(\s*\)', 'inline_async_click_remove', 'writer'),
(4, r'\b(?:atob|decodeURIComponent)\s*\([^)]*(?:dataset|getAttribute)\s*\.[^)]*\)[^;]*\.href\s*=[^;]*\.download\s*=[^;]*\.click\s*\(\s*\)', 'decode_dataset_click', 'writer'),
(4, r'\bdocument\.body\.appendChild\s*\([^)]+\)[^;]*\.click\s*\(\s*\)[^;]*\.remove\s*\(\s*\)', 'append_click_remove', 'writer'),
]
# Cheap pre-filter: content that contains none of these tokens is reported
# as clean without running any entry of the signature table.
_QUICK_CHECK = re.compile(
    r'blob|atob|download|base64|arraybuffer|uint8array|createobjecturl|fromcharcode',
    re.IGNORECASE,
)

# Accumulated weight at which a file is flagged as smuggling.
_THRESHOLD = 4

# Upper bound (5 MiB) on how much of the file is read for the scan.
_MAX_BYTES = 5 * 1024 * 1024

# Compile the signature table once at import time and split it by weight:
# `_HIGH` holds the weight >= 3 entries, `_LOW` the remainder.
_RE_FLAGS = re.IGNORECASE | re.DOTALL
_COMPILED = [
    (weight, re.compile(source, _RE_FLAGS), name, category)
    for weight, source, name, category in _PATTERNS
]
_HIGH = [entry for entry in _COMPILED if entry[0] >= 3]
_LOW = [entry for entry in _COMPILED if entry[0] < 3]
def get_html_smuggle_info(filepath: str) -> Dict:
    """Scan an HTML file on disk for smuggling indicators.

    At most `_MAX_BYTES` of the file are read and decoded as UTF-8 (invalid
    bytes are replaced). Feature counts and IOCs are always collected; the
    pattern scan only runs when the quick pre-filter finds a relevant token.
    The low-weight patterns are consulted only when the high-weight pass
    ended below the threshold but within 2 points of it.

    Returns:
        `{'html_smuggle_info': report}` so callers can pass the result
        straight to `file_info.update(...)`. When the file cannot be read
        (`OSError`), `report` is `{'error': 'read failed: ...'}` instead of
        a full report.
    """
    try:
        on_disk_size = os.path.getsize(filepath)
        with open(filepath, 'rb') as handle:
            data = handle.read(_MAX_BYTES)
        content = data.decode('utf-8', errors='replace')
        # True when the file is larger than what was actually read.
        truncated = on_disk_size > len(data)
    except OSError as e:
        return {'html_smuggle_info': {'error': f'read failed: {e}'}}

    features = _features(content)
    iocs = _iocs(content)

    if _QUICK_CHECK.search(content):
        score, matches = _scan(content, _HIGH, _THRESHOLD)
        near_miss_floor = max(0, _THRESHOLD - 2)
        if near_miss_floor <= score < _THRESHOLD:
            # Close to the threshold: let the low-weight patterns supply
            # the remaining points.
            low_score, low_matches = _scan(content, _LOW, _THRESHOLD - score)
            score = score + low_score
            matches.extend(low_matches)
        report = _build(score >= _THRESHOLD, score, matches, features, iocs, truncated)
    else:
        report = _build(False, 0, [], features, iocs, truncated)

    return {'html_smuggle_info': report}
def _scan(content: str, patterns, max_score: int):
    """Run compiled patterns against `content`, stopping once enough score accrues.

    Args:
        content: text to search.
        patterns: iterable of `(weight, compiled_regex, name, category)`.
        max_score: scanning stops right after the match that brings the
            running total to this value or above.

    Returns:
        `(score, matches)` where `matches` is a list of
        `{'name', 'category', 'weight'}` dicts in the order they fired.
    """
    total = 0
    hits: List[Dict] = []
    for weight, rx, name, category in patterns:
        if rx.search(content) is None:
            continue
        total += weight
        hits.append(dict(name=name, category=category, weight=weight))
        if total >= max_score:
            break
    return total, hits
def _features(content: str) -> Dict:
    """Collect surface-level counts and presence flags from the HTML text.

    Mirrors SmuggleShield's MLDetector feature set. `file_size` is the
    length of the decoded text; the base64 figures describe runs of 50 or
    more characters from the base64 alphabet.
    """
    def present(pattern: str) -> bool:
        return re.search(pattern, content, re.IGNORECASE) is not None

    def occurrences(pattern: str) -> int:
        return len(re.findall(pattern, content, re.IGNORECASE))

    run_lengths = []
    for run in re.finditer(r'[A-Za-z0-9+/=]{50,}', content):
        run_lengths.append(len(run.group(0)))
    longest_run = max(run_lengths) if run_lengths else 0

    return {
        'file_size': len(content),
        'has_blob': present(r'\bblob\s*\('),
        'has_atob': present(r'\batob\s*\('),
        'has_uint8array': present(r'\buint8array\b'),
        'has_createobjecturl': present(r'createobjecturl'),
        'has_download_attr': present(r'\bdownload\s*=\s*[\'"][^\'"]+[\'"]'),
        'has_fromcharcode': present(r'fromcharcode'),
        'script_tags': occurrences(r'<script\b'),
        'iframe_tags': occurrences(r'<iframe\b'),
        'embed_tags': occurrences(r'<embed\b'),
        'base64_blob_count': len(run_lengths),
        'largest_base64_chars': longest_run,
    }
def _iocs(content: str) -> Dict:
    """Pull operator-readable artifacts out of the HTML text.

    Returns a dict with:
        download_filenames: up to 20 distinct `download="..."` attribute
            values (1-100 chars each), in order of first appearance.
        data_file_attrs: up to 10 distinct `data-file="..."` base64 values,
            each cut to its first 200 characters, in order of first
            appearance.
        largest_base64_blob: length plus first/last 120 characters of the
            longest run of 200+ base64-alphabet characters, or `None` when
            no such run exists.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order. Going
    # through a set made both the ordering and the subset kept by the slice
    # depend on string hashing, i.e. differ from run to run.
    download_names = list(dict.fromkeys(
        m.group(1)
        for m in re.finditer(r'\bdownload\s*=\s*[\'"]([^\'"]{1,100})[\'"]', content, re.IGNORECASE)
    ))[:20]

    largest_b64 = ''
    for m in re.finditer(r'[A-Za-z0-9+/=]{200,}', content):
        blob = m.group(0)
        if len(blob) > len(largest_b64):
            largest_b64 = blob
            # Stop searching once a blob over 50000 chars has been found.
            if len(largest_b64) > 50000:
                break

    data_file_attrs = list(dict.fromkeys(
        m.group(1)[:200]
        for m in re.finditer(r'\bdata-file\s*=\s*[\'"]([A-Za-z0-9+/=]{20,})[\'"]', content, re.IGNORECASE)
    ))[:10]

    return {
        'download_filenames': download_names,
        'data_file_attrs': data_file_attrs,
        'largest_base64_blob': {
            'length': len(largest_b64),
            'preview_first_120': largest_b64[:120],
            'preview_last_120': largest_b64[-120:] if len(largest_b64) > 120 else '',
        } if largest_b64 else None,
    }
def _build(is_smuggling: bool, score: int, matches, features, iocs, truncated: bool) -> Dict:
    """Assemble the report dict stored under `html_smuggle_info`.

    Tallies matched patterns per category and derives human-readable
    `detection_notes` from the verdict, the score and a few feature flags;
    every other argument is passed through into the report unchanged.
    """
    category_counts: Dict[str, int] = {}
    for hit in matches:
        label = hit['category']
        category_counts.setdefault(label, 0)
        category_counts[label] += 1

    notes: List[str] = []

    if is_smuggling:
        fired = len(matches)
        plural = '' if fired == 1 else 's'
        notes.append(
            f"HTML smuggling detected -- pattern score {score} >= threshold {_THRESHOLD} "
            f"({fired} pattern{plural} fired)"
        )
    elif score > 0:
        notes.append(f"Suspicious patterns present but below threshold ({score}/{_THRESHOLD})")

    largest = features.get('largest_base64_chars', 0)
    if largest >= 1000:
        notes.append(
            f"Large base64 blob present ({largest} chars) "
            f"-- typical of smuggled binary payload"
        )

    if features.get('has_download_attr') and features.get('has_blob'):
        notes.append("Combination of <a download> + Blob -- classic smuggling-writer chain")

    if features.get('has_atob') and features.get('has_uint8array'):
        notes.append("atob() + Uint8Array decode chain present")

    report = {
        'is_smuggling': is_smuggling,
        'score': score,
        'threshold': _THRESHOLD,
        'matched_patterns': matches,
        'matched_categories': category_counts,
        'features': features,
        'iocs': iocs,
        'truncated': truncated,
        'detection_notes': notes,
    }
    return report
+29
View File
@@ -0,0 +1,29 @@
# app/utils/lnk.py
"""Windows shortcut (.lnk) analyzer.
Runs at upload time on .lnk files (alongside `get_pe_info`, `get_office_info`,
`get_html_smuggle_info`). Output lands in `file_info.lnk_info`.
Heavy lifting is in `app.analyzers.static.lnk_parser.LnkForensics`; this
module is a thin wrapper that adapts the parser to the file_io drop-in
contract (returns `{lnk_info: {...}}` ready for `file_info.update(...)`).
"""
import logging
from typing import Dict
from ..analyzers.static.lnk_parser import LnkForensics
logger = logging.getLogger(__name__)
def get_lnk_info(filepath: str) -> Dict:
    """Parse a Windows shortcut and wrap the result for `file_info.update(...)`.

    Returns `{'lnk_info': <forensic data>}` when `LnkForensics` reports the
    file as valid. Returns `{'lnk_info': None}` when it is not valid or when
    parsing raises; a failure is logged as a warning.
    """
    outcome: Dict = {'lnk_info': None}
    try:
        parser = LnkForensics(filepath)
        if parser.is_valid():
            outcome['lnk_info'] = parser.get_forensic_data()
    except Exception as e:
        logger.warning(f"LNK analysis failed on {filepath}: {e}")
    return outcome
+220
View File
@@ -0,0 +1,220 @@
# app/utils/office.py
"""Office document analyzer.
Runs at upload time on Word / Excel / RTF / legacy CFBF binaries (alongside
`get_pe_info`, `get_lnk_info`, `get_html_smuggle_info`). Output lands in
`file_info.office_info`.
Two analysis branches:
1. olevba -- VBA / XLM macros embedded in the file. Pulls per-module
source, autoexec triggers, suspicious keyword hits, IOCs.
2. OOXML rels inspection -- external `attachedTemplate` / `oleObject` /
`subDocument` / `frame` references. Catches T1221 (Remote Template
Injection) which is invisible to olevba because the malicious VBA
lives in a remote .dotm, not in the file itself. Atomic Red Team's
`Calculator.docx` is the canonical example.
"""
import logging
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict, List
from oletools.olevba import VBA_Parser
logger = logging.getLogger(__name__)
# Relationship Types we care about when they target an external (HTTP/UNC)
# resource. `attachedTemplate` is the T1221 vector. The others pull remote
# content the same way; less common but the same class of risk.
#
# Entries are compared against the last `/`-separated segment of a
# relationship's `Type` attribute (exact, case-sensitive match). Every match
# is recorded in `external_refs`; dedicated detection notes are emitted only
# for `attachedTemplate`, `oleObject` and `subDocument`.
_INTERESTING_RELS = (
'attachedTemplate',
'oleObject',
'subDocument',
'frame',
'image', # rare but seen in malicious docs that fetch tracking pixels
# NOTE(review): external hyperlinks are presumably common in benign
# documents too, so this entry may make `external_refs` noisy -- confirm.
'hyperlink',
)
def get_office_info(filepath: str, malapi_path=None) -> Dict:
    """Analyze an Office document and wrap the result for `file_info.update(...)`.

    Builds an empty report skeleton, then lets the olevba branch and the
    external-relationship branch fill it in place. Always returns
    `{'office_info': {...}}`.

    `malapi_path` is accepted only for backward compatibility with the old
    SecurityAnalyzer delegation; it is not used.
    """
    # Analysis buckets: autoexec / suspicious hold {keyword, description}
    # entries, iocs holds {type, value} entries, the rest hold decoded
    # string findings.
    analysis_keys = (
        'autoexec',
        'suspicious',
        'iocs',
        'hex_strings',
        'base64_strings',
        'vba_strings',
    )
    report: Dict = {}
    report['file_type'] = 'Microsoft Office Document'
    report['has_macros'] = False
    report['modules'] = []            # [{stream, vba_filename, code}]
    report['analysis'] = {key: [] for key in analysis_keys}
    report['external_refs'] = []      # external relationship targets (T1221 etc.)
    report['detection_notes'] = []

    for branch in (_run_olevba, _run_external_rels):
        branch(filepath, report)

    return {'office_info': report}
def _run_olevba(filepath: str, info: Dict) -> None:
    """Branch 1 -- VBA / XLM macro analysis via oletools.olevba.

    Mutates `info` in place: sets `has_macros`, appends per-module source to
    `modules`, sorts olevba's analysis rows into the `analysis` buckets and
    adds summary lines to `detection_notes`. Failures are logged as warnings
    and never propagate; whatever was collected before the failure is kept.

    Args:
        filepath: path of the document to analyze.
        info: report skeleton created by `get_office_info`.
    """
    try:
        vbaparser = VBA_Parser(filepath)
    except Exception as e:
        logger.warning(f"olevba init failed on {filepath}: {e}")
        return

    # Lower-cased olevba result type -> bucket that stores
    # {keyword, description} entries. IOC rows are handled separately.
    keyword_buckets = {
        'autoexec': 'autoexec',
        'suspicious': 'suspicious',
        'hex string': 'hex_strings',
        'base64 string': 'base64_strings',
        'vba string': 'vba_strings',
        'vba_string': 'vba_strings',
    }

    def plural(count: int) -> str:
        return 's' if count != 1 else ''

    try:
        if not vbaparser.detect_vba_macros():
            return
        info['has_macros'] = True

        # Per-module source code: (filename, stream_path, vba_filename, vba_code)
        for _, stream, vba_fname, vba_code in vbaparser.extract_macros():
            if vba_code:
                info['modules'].append({
                    'stream': stream,
                    'vba_filename': vba_fname,
                    'code': vba_code,
                })

        analysis = info['analysis']
        # Structured analysis -- olevba returns (kw_type, keyword, description)
        for kw_type, keyword, description in vbaparser.analyze_macros():
            kt = (kw_type or '').lower()
            if kt in ('ioc', 'iocs'):
                # olevba labels these rows 'IOC' and emits them as
                # (type, value, pattern_type): the extracted indicator is in
                # the keyword slot and its kind (URL, IPv4 address, ...) in
                # the description slot. Matching only 'iocs' never fired.
                analysis['iocs'].append({'type': description, 'value': keyword})
                continue
            bucket = keyword_buckets.get(kt)
            if bucket is not None:
                analysis[bucket].append({'keyword': keyword, 'description': description})

        if analysis['autoexec']:
            n = len(analysis['autoexec'])
            info['detection_notes'].append(
                f"{n} auto-execution trigger{plural(n)} detected"
            )
        if analysis['suspicious']:
            n = len(analysis['suspicious'])
            info['detection_notes'].append(
                f"{n} suspicious keyword{plural(n)} in macro body"
            )
        if analysis['iocs']:
            n = len(analysis['iocs'])
            info['detection_notes'].append(
                f"{n} IOC{plural(n)} extracted from macro"
            )
    except Exception as e:
        logger.warning(f"olevba analysis failed on {filepath}: {e}")
    finally:
        # Best-effort cleanup: a failing close() must not mask the result.
        try:
            vbaparser.close()
        except Exception:
            pass
def _run_external_rels(filepath: str, info: Dict) -> None:
    """Branch 2 -- T1221 / external-relationship inspection.

    Stores the external relationships found by
    `_scan_external_relationships` in `info['external_refs']` and appends a
    detection note for each of `attachedTemplate`, `oleObject` and
    `subDocument` that occurs at least once. A failing scan is logged as a
    warning and leaves `info` untouched, as does an empty result.
    """
    try:
        refs = _scan_external_relationships(filepath)
    except Exception as e:
        logger.warning(f"External-rels scan failed on {filepath}: {e}")
        return

    if not refs:
        return
    info['external_refs'] = refs

    # Count references per relationship kind in a single pass.
    per_kind: Dict[str, int] = {}
    for ref in refs:
        kind = ref['relationship']
        per_kind[kind] = per_kind.get(kind, 0) + 1

    template_count = per_kind.get('attachedTemplate', 0)
    if template_count:
        suffix = '' if template_count == 1 else 's'
        info['detection_notes'].append(
            f"MITRE T1221: Remote Template Injection -- {template_count} "
            f"external `attachedTemplate` reference"
            f"{suffix}. "
            f"Malicious VBA likely lives in the remote target, not in this file."
        )

    ole_count = per_kind.get('oleObject', 0)
    if ole_count:
        suffix = '' if ole_count == 1 else 's'
        info['detection_notes'].append(
            f"{ole_count} external OLE-object reference"
            f"{suffix} -- remote-fetched embedded payload"
        )

    subdoc_count = per_kind.get('subDocument', 0)
    if subdoc_count:
        suffix = '' if subdoc_count == 1 else 's'
        info['detection_notes'].append(
            f"{subdoc_count} external subDocument reference"
            f"{suffix}"
        )
def _scan_external_relationships(filepath: str) -> List[Dict]:
    """Collect interesting external relationships from an OOXML container.

    Walks every `*.rels` member of the zip and returns one dict per
    `Relationship` element whose `TargetMode` is `External`
    (case-insensitive) and whose `Type` ends in one of `_INTERESTING_RELS`.

    Members that cannot be read or parsed are skipped individually, so a
    single malformed part cannot hide findings from the others.

    Returns:
        List of `{rels_file, relationship, target, target_mode, full_type}`
        dicts; `[]` for non-zip files (legacy CFBF .doc/.xls binaries).
    """
    if not zipfile.is_zipfile(filepath):
        return []

    findings: List[Dict] = []
    try:
        with zipfile.ZipFile(filepath) as z:
            rels_files = [n for n in z.namelist() if n.endswith('.rels')]
            for rels_name in rels_files:
                try:
                    data = z.read(rels_name)
                except Exception:
                    continue
                try:
                    root = ET.fromstring(data)
                except (ET.ParseError, ValueError, LookupError):
                    # Besides ParseError, the expat-backed parser raises
                    # LookupError for an unknown declared encoding and
                    # ValueError for a multi-byte one. Skip just this part.
                    continue

                for rel in root.iter():
                    # Drop the `{namespace}` prefix ElementTree puts on tags.
                    tag = rel.tag.rsplit('}', 1)[-1] if '}' in rel.tag else rel.tag
                    if tag != 'Relationship':
                        continue
                    if rel.attrib.get('TargetMode', '').lower() != 'external':
                        continue
                    rel_type = rel.attrib.get('Type', '')
                    target = rel.attrib.get('Target', '')
                    # The relationship kind is the last path segment of the Type URI.
                    rel_name = rel_type.rsplit('/', 1)[-1] if '/' in rel_type else rel_type
                    if rel_name not in _INTERESTING_RELS:
                        continue
                    findings.append({
                        'rels_file': rels_name,
                        'relationship': rel_name,
                        'target': target,
                        'target_mode': 'External',
                        'full_type': rel_type,
                    })
    except zipfile.BadZipFile:
        # is_zipfile() passed but the archive is corrupt: keep what we have.
        pass
    return findings