221 lines
8.0 KiB
Python
221 lines
8.0 KiB
Python
# app/utils/office.py
"""Office document analyzer.

Runs at upload time on Word / Excel / RTF / legacy CFBF binaries (alongside
`get_pe_info`, `get_lnk_info`, `get_html_smuggle_info`). Output lands in
`file_info.office_info`.

Two analysis branches:

  1. olevba -- VBA / XLM macros embedded in the file. Pulls per-module
     source, autoexec triggers, suspicious keyword hits, IOCs.

  2. OOXML rels inspection -- external `attachedTemplate` / `oleObject` /
     `subDocument` / `frame` / `image` / `hyperlink` references. Catches
     T1221 (Remote Template Injection), which is invisible to olevba because
     the malicious VBA lives in a remote .dotm, not in the file itself.
     Atomic Red Team's `Calculator.docx` is the canonical example.
"""
|
|
|
|
import logging
|
|
import xml.etree.ElementTree as ET
|
|
import zipfile
|
|
from typing import Dict, List
|
|
|
|
from oletools.olevba import VBA_Parser
|
|
|
|
# Module-level logger; the analysis branches below report failures through it
# as warnings instead of raising.
logger = logging.getLogger(__name__)


# Relationship Types we care about when they target an external (HTTP/UNC)
# resource. `attachedTemplate` is the T1221 vector. The others pull remote
# content the same way; less common but the same class of risk.
#
# `_scan_external_relationships` compares each entry exactly (case-sensitive)
# against the last '/'-separated segment of a Relationship's `Type` URI.
_INTERESTING_RELS = (
    'attachedTemplate',
    'oleObject',
    'subDocument',
    'frame',
    'image',          # rare but seen in malicious docs that fetch tracking pixels
    'hyperlink',
)
|
|
|
|
|
|
def get_office_info(filepath: str, malapi_path=None) -> Dict:
    """Analyze an Office document and return `{'office_info': report}`.

    The single-key wrapper mirrors the `get_lnk_info` /
    `get_html_smuggle_info` shape, so file_io can merge it with
    `file_info.update(result)` without conditionals.

    A report pre-filled with empty defaults is handed to each analysis
    branch in turn; the branches fill it in place.

    Args:
        filepath: Path of the document to analyze.
        malapi_path: Accepted only for back-compat with the old
            SecurityAnalyzer delegation; unused, because the office
            analyzer doesn't need MalAPI lookups.

    Returns:
        A dict whose only key, `office_info`, holds the report.
    """
    # Buckets filled by the olevba branch.
    macro_analysis = {
        'autoexec': [],        # [{keyword, description}] auto-execution triggers
        'suspicious': [],      # [{keyword, description}] suspicious keyword hits
        'iocs': [],            # [{type, value}] extracted URLs / IPs / EXEs / etc.
        'hex_strings': [],
        'base64_strings': [],
        'vba_strings': [],
    }
    report = {
        'file_type': 'Microsoft Office Document',
        'has_macros': False,
        'modules': [],            # [{stream, vba_filename, code}]
        'analysis': macro_analysis,
        'external_refs': [],      # external relationship targets (T1221 etc.)
        'detection_notes': [],
    }

    # Macro analysis first, then the external-relationship scan; each branch
    # mutates `report` in place.
    for branch in (_run_olevba, _run_external_rels):
        branch(filepath, report)

    return {'office_info': report}
|
|
|
|
|
|
def _run_olevba(filepath: str, info: Dict) -> None:
    """Branch 1 -- VBA / XLM macro analysis via oletools.olevba.

    Mutates `info` in place:

    * `has_macros` is set to True when olevba detects macros.
    * `modules` gains one `{stream, vba_filename, code}` entry per module
      with non-empty source returned by `extract_macros()`.
    * `analysis` buckets are filled from the `(type, keyword, description)`
      tuples returned by `analyze_macros()`.
    * `detection_notes` gains one summary line for each non-empty
      autoexec / suspicious / IOC bucket.

    Best-effort: any olevba failure is logged as a warning and nothing is
    raised; whatever was collected before the failure stays in `info`.

    Args:
        filepath: Path of the document handed to `VBA_Parser`.
        info: The report dict built by `get_office_info`.
    """
    try:
        vbaparser = VBA_Parser(filepath)
    except Exception as e:
        logger.warning("olevba init failed on %s: %s", filepath, e)
        return

    # Lower-cased olevba result type -> key in info['analysis'] for rows that
    # are stored as plain {keyword, description} entries.
    plain_buckets = {
        'autoexec': 'autoexec',
        'suspicious': 'suspicious',
        'hex string': 'hex_strings',
        'base64 string': 'base64_strings',
        'vba string': 'vba_strings',
        'vba_string': 'vba_strings',
    }

    try:
        if not vbaparser.detect_vba_macros():
            return

        info['has_macros'] = True

        # Per-module source code: (filename, stream_path, vba_filename, vba_code)
        for _, stream, vba_fname, vba_code in vbaparser.extract_macros():
            if vba_code:
                info['modules'].append({
                    'stream': stream,
                    'vba_filename': vba_fname,
                    'code': vba_code,
                })

        analysis = info['analysis']

        # Structured analysis -- olevba returns (kw_type, keyword, description)
        for kw_type, keyword, description in vbaparser.analyze_macros():
            kt = (kw_type or '').lower()
            if kt in ('ioc', 'iocs'):
                # olevba labels these rows 'IOC' (singular), so the previous
                # check for 'iocs' never matched and IOCs were dropped. For
                # IOC rows the extracted value is in `keyword` and the pattern
                # kind (e.g. "URL") is in `description`.
                analysis['iocs'].append({'type': description, 'value': keyword})
                continue
            bucket = plain_buckets.get(kt)
            if bucket is not None:
                analysis[bucket].append(
                    {'keyword': keyword, 'description': description}
                )
            # Any other result type is intentionally not recorded.

        # One human-readable summary line per non-empty headline bucket.
        for bucket, label, tail in (
            ('autoexec', 'auto-execution trigger', ' detected'),
            ('suspicious', 'suspicious keyword', ' in macro body'),
            ('iocs', 'IOC', ' extracted from macro'),
        ):
            count = len(analysis[bucket])
            if count:
                info['detection_notes'].append(
                    f"{count} {label}{'s' if count != 1 else ''}{tail}"
                )
    except Exception as e:
        logger.warning("olevba analysis failed on %s: %s", filepath, e)
    finally:
        try:
            vbaparser.close()
        except Exception as e:
            # Cleanup stays best-effort so it can never mask the analysis
            # outcome, but the failure is no longer invisible.
            logger.debug("olevba close failed on %s: %s", filepath, e)
|
|
|
|
|
|
def _run_external_rels(filepath: str, info: Dict) -> None:
    """Branch 2 -- T1221 / external-relationship inspection.

    Stores every finding from `_scan_external_relationships` in
    `info['external_refs']` and appends a summary line to
    `info['detection_notes']` for the `attachedTemplate` (T1221),
    `oleObject` and `subDocument` relationship kinds that are present.

    A scan failure is logged as a warning and leaves `info` untouched; an
    empty scan result also leaves `info` untouched.

    Args:
        filepath: Path of the document to scan.
        info: The report dict built by `get_office_info`.
    """
    try:
        refs = _scan_external_relationships(filepath)
    except Exception as e:
        logger.warning(f"External-rels scan failed on {filepath}: {e}")
        return

    if not refs:
        return

    info['external_refs'] = refs

    def tally(kind):
        """Count findings whose relationship name equals `kind`."""
        return sum(1 for ref in refs if ref['relationship'] == kind)

    def plural(count):
        """Return the suffix that pluralizes a noun for `count` items."""
        return '' if count == 1 else 's'

    template_count = tally('attachedTemplate')
    if template_count:
        info['detection_notes'].append(
            f"MITRE T1221: Remote Template Injection -- {template_count} "
            f"external `attachedTemplate` reference"
            f"{plural(template_count)}. "
            f"Malicious VBA likely lives in the remote target, not in this file."
        )

    ole_count = tally('oleObject')
    if ole_count:
        info['detection_notes'].append(
            f"{ole_count} external OLE-object reference"
            f"{plural(ole_count)} -- remote-fetched embedded payload"
        )

    subdoc_count = tally('subDocument')
    if subdoc_count:
        info['detection_notes'].append(
            f"{subdoc_count} external subDocument reference"
            f"{plural(subdoc_count)}"
        )
|
|
|
|
|
|
def _scan_external_relationships(filepath: str) -> List[Dict]:
    """Collect interesting external relationships from an OOXML container.

    Every archive member whose name ends in `.rels` is parsed, and each
    `Relationship` element is kept when its `TargetMode` is `External`
    (compared case-insensitively) AND the last path segment of its `Type`
    is one of `_INTERESTING_RELS`.

    Members that cannot be read or are not well-formed XML are skipped. A
    `zipfile.BadZipFile` raised while walking the archive ends the scan and
    returns whatever was collected up to that point.

    Args:
        filepath: Path of the document to inspect.

    Returns:
        A list of `{rels_file, relationship, target, target_mode,
        full_type}` dicts; `[]` for non-zip files (legacy CFBF .doc/.xls
        binaries).
    """
    if not zipfile.is_zipfile(filepath):
        return []

    findings: List[Dict] = []
    try:
        with zipfile.ZipFile(filepath) as archive:
            for member in archive.namelist():
                if not member.endswith('.rels'):
                    continue

                try:
                    payload = archive.read(member)
                except Exception:
                    # Unreadable member: skip it, keep scanning the rest.
                    continue

                try:
                    root = ET.fromstring(payload)
                except ET.ParseError:
                    # Malformed XML: skip it, keep scanning the rest.
                    continue

                for node in root.iter():
                    # Drop any '{namespace}' prefix from the element tag.
                    local_tag = node.tag.rpartition('}')[2]
                    if local_tag != 'Relationship':
                        continue

                    attrs = node.attrib
                    if attrs.get('TargetMode', '').lower() != 'external':
                        continue

                    # The relationship name is the last '/' segment of Type.
                    full_type = attrs.get('Type', '')
                    kind = full_type.rpartition('/')[2]
                    if kind not in _INTERESTING_RELS:
                        continue

                    findings.append({
                        'rels_file': member,
                        'relationship': kind,
                        'target': attrs.get('Target', ''),
                        'target_mode': 'External',
                        'full_type': full_type,
                    })
    except zipfile.BadZipFile:
        pass

    return findings
|