Files
litterbox/app/utils/office.py
T

221 lines
8.0 KiB
Python

# app/utils/office.py
"""Office document analyzer.
Runs at upload time on Word / Excel / RTF / legacy CFBF binaries (alongside
`get_pe_info`, `get_lnk_info`, `get_html_smuggle_info`). Output lands in
`file_info.office_info`.
Two analysis branches:
1. olevba -- VBA / XLM macros embedded in the file. Pulls per-module
source, autoexec triggers, suspicious keyword hits, IOCs.
2. OOXML rels inspection -- external `attachedTemplate` / `oleObject` /
`subDocument` / `frame` references. Catches T1221 (Remote Template
Injection) which is invisible to olevba because the malicious VBA
lives in a remote .dotm, not in the file itself. Atomic Red Team's
`Calculator.docx` is the canonical example.
"""
import logging
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict, List
from oletools.olevba import VBA_Parser
logger = logging.getLogger(__name__)
# Relationship Types we care about when they target an external (HTTP/UNC)
# resource. Each entry is compared against the last `/`-separated segment of
# a relationship's `Type` URI by `_scan_external_relationships`; the match is
# exact and case-sensitive. `attachedTemplate` is the T1221 vector. The others
# pull remote content the same way; less common but the same class of risk.
_INTERESTING_RELS = (
    'attachedTemplate',
    'oleObject',
    'subDocument',
    'frame',
    'image',  # rare but seen in malicious docs that fetch tracking pixels
    'hyperlink',
)
def get_office_info(filepath: str, malapi_path=None) -> Dict:
    """Analyze an Office document and return `{'office_info': {...}}`.

    The single-key wrapper mirrors the `get_lnk_info` /
    `get_html_smuggle_info` result shape so file_io can merge it with
    `file_info.update(result)` without conditionals.

    Args:
        filepath: Path of the uploaded document to inspect.
        malapi_path: Unused. Kept only for back-compat with the old
            SecurityAnalyzer delegation; the office analyzer does no
            MalAPI lookups.

    Returns:
        A dict with one key, `office_info`, holding the findings skeleton
        below after both analysis branches have filled it in place.
    """
    # Buckets filled by the olevba branch.
    analysis = {
        'autoexec': [],        # [{keyword, description}] auto-execution triggers
        'suspicious': [],      # [{keyword, description}] suspicious keyword hits
        'iocs': [],            # [{type, value}] extracted URLs / IPs / EXEs / etc.
        'hex_strings': [],
        'base64_strings': [],
        'vba_strings': [],
    }
    info = {
        'file_type': 'Microsoft Office Document',
        'has_macros': False,
        'modules': [],          # [{stream, vba_filename, code}]
        'analysis': analysis,
        'external_refs': [],    # external relationship targets (T1221 etc.)
        'detection_notes': [],
    }

    # Macro analysis runs first, then the OOXML relationship scan; each
    # branch mutates `info` directly.
    for branch in (_run_olevba, _run_external_rels):
        branch(filepath, info)

    return {'office_info': info}
def _run_olevba(filepath: str, info: Dict) -> None:
    """Branch 1 -- VBA / XLM macro analysis via oletools.olevba.

    Mutates `info` in place:
      * `has_macros` becomes True when olevba detects macros.
      * `modules` gains one `{stream, vba_filename, code}` dict per module
        whose source is non-empty.
      * the `analysis` buckets are filled from `analyze_macros()` rows.
      * `detection_notes` gains one summary line per non-empty
        autoexec / suspicious / IOC bucket.

    Best-effort by design: any `Exception` raised by olevba is logged as a
    warning and swallowed, leaving `info` with whatever was collected up to
    that point.

    Args:
        filepath: Path of the document handed to `VBA_Parser`.
        info: The `office_info` dict built by `get_office_info`.
    """
    try:
        vbaparser = VBA_Parser(filepath)
    except Exception as e:
        logger.warning("olevba init failed on %s: %s", filepath, e)
        return

    # Lower-cased olevba kw_type -> `analysis` bucket that stores the row as
    # {keyword, description}. IOC rows are handled separately below because
    # they are stored as {type, value}. Any other kw_type is ignored.
    plain_buckets = {
        'autoexec': 'autoexec',
        'suspicious': 'suspicious',
        'hex string': 'hex_strings',
        'base64 string': 'base64_strings',
        'vba string': 'vba_strings',
        'vba_string': 'vba_strings',
    }

    try:
        if not vbaparser.detect_vba_macros():
            return
        info['has_macros'] = True
        analysis = info['analysis']

        # Per-module source code: (filename, stream_path, vba_filename, vba_code)
        for _, stream, vba_fname, vba_code in vbaparser.extract_macros():
            if vba_code:
                info['modules'].append({
                    'stream': stream,
                    'vba_filename': vba_fname,
                    'code': vba_code,
                })

        # Structured analysis -- olevba returns (kw_type, keyword, description)
        for kw_type, keyword, description in vbaparser.analyze_macros():
            kt = (kw_type or '').lower()
            if kt in ('ioc', 'iocs'):
                # olevba labels these rows 'IOC' (singular); `keyword` is the
                # matched string and `description` names the pattern kind
                # (URL, IPv4 address, executable file name, ...). 'iocs' is
                # still accepted so the previous spelling keeps working.
                analysis['iocs'].append({'type': description, 'value': keyword})
                continue
            bucket = plain_buckets.get(kt)
            if bucket is not None:
                analysis[bucket].append(
                    {'keyword': keyword, 'description': description}
                )

        # One human-readable note per non-empty headline bucket.
        for key, noun, tail in (
            ('autoexec', 'auto-execution trigger', ' detected'),
            ('suspicious', 'suspicious keyword', ' in macro body'),
            ('iocs', 'IOC', ' extracted from macro'),
        ):
            count = len(analysis[key])
            if count:
                info['detection_notes'].append(
                    f"{count} {noun}{'s' if count != 1 else ''}{tail}"
                )
    except Exception as e:
        logger.warning("olevba analysis failed on %s: %s", filepath, e)
    finally:
        # Closing is best-effort: a failure here must not mask the results
        # already gathered, so it is only recorded at debug level.
        try:
            vbaparser.close()
        except Exception as e:
            logger.debug("olevba close failed on %s: %s", filepath, e)
def _run_external_rels(filepath: str, info: Dict) -> None:
    """Branch 2 -- T1221 / external-relationship inspection.

    Stores the external relationships found by
    `_scan_external_relationships` in `info['external_refs']` and appends a
    detection note for each of the `attachedTemplate`, `oleObject` and
    `subDocument` kinds that occurs at least once. A scan failure is logged
    as a warning and leaves `info` untouched; an empty scan result also
    leaves `info` untouched.

    Args:
        filepath: Path of the document to scan.
        info: The `office_info` dict built by `get_office_info`.
    """
    try:
        refs = _scan_external_relationships(filepath)
    except Exception as e:
        logger.warning(f"External-rels scan failed on {filepath}: {e}")
        return

    if not refs:
        return
    info['external_refs'] = refs

    def how_many(kind):
        # Number of external references of the given relationship kind.
        return sum(1 for ref in refs if ref['relationship'] == kind)

    def plural(count):
        # English plural suffix for a count.
        return '' if count == 1 else 's'

    def add_note(text):
        info['detection_notes'].append(text)

    template_count = how_many('attachedTemplate')
    if template_count:
        add_note(
            f"MITRE T1221: Remote Template Injection -- {template_count} "
            f"external `attachedTemplate` reference{plural(template_count)}. "
            f"Malicious VBA likely lives in the remote target, not in this file."
        )

    ole_count = how_many('oleObject')
    if ole_count:
        add_note(
            f"{ole_count} external OLE-object reference{plural(ole_count)}"
            f" -- remote-fetched embedded payload"
        )

    subdoc_count = how_many('subDocument')
    if subdoc_count:
        add_note(
            f"{subdoc_count} external subDocument reference{plural(subdoc_count)}"
        )
def _scan_external_relationships(filepath: str, interesting_rels=None) -> List[Dict]:
    """Return the external relationships of interest in an OOXML container.

    Walks every `*.rels` part inside the zip and keeps each `Relationship`
    element whose `TargetMode` is `External` (compared case-insensitively)
    AND whose `Type` URI ends in one of the wanted relationship names.

    Args:
        filepath: Path of the document to inspect.
        interesting_rels: Optional collection of relationship names (the
            last `/`-separated segment of the `Type` URI) to report.
            Defaults to the module-level `_INTERESTING_RELS`.

    Returns:
        A list of dicts with keys `rels_file`, `relationship`, `target`,
        `target_mode` (always the literal `'External'`) and `full_type`.
        Returns `[]` for non-zip files (legacy CFBF .doc/.xls binaries).
        Unreadable or malformed `.rels` parts are skipped; if the archive
        turns out to be corrupt mid-scan, the findings gathered so far are
        returned.
    """
    if not zipfile.is_zipfile(filepath):
        return []
    if interesting_rels is None:
        interesting_rels = _INTERESTING_RELS

    findings: List[Dict] = []
    try:
        with zipfile.ZipFile(filepath) as z:
            # OPC part names are compared case-insensitively, so fold case
            # here; otherwise a part named `*.RELS` would escape the scan.
            rels_files = [n for n in z.namelist() if n.lower().endswith('.rels')]
            for rels_name in rels_files:
                # NOTE(security): the archive is untrusted. `z.read` inflates
                # the whole entry in memory, and ElementTree does not defend
                # against entity-expansion payloads on older Expat builds.
                # No size cap is applied here on purpose: silently skipping
                # large parts would give a padded `.rels` a way to evade
                # detection. Revisit if this runs outside a sandbox.
                try:
                    data = z.read(rels_name)
                except Exception as e:
                    logger.debug(
                        "Skipping unreadable rels part %s in %s: %s",
                        rels_name, filepath, e,
                    )
                    continue
                try:
                    root = ET.fromstring(data)
                except ET.ParseError as e:
                    logger.debug(
                        "Skipping malformed rels part %s in %s: %s",
                        rels_name, filepath, e,
                    )
                    continue

                for rel in root.iter():
                    # Drop the `{namespace}` prefix; rsplit is a no-op when
                    # the tag carries no namespace.
                    if rel.tag.rsplit('}', 1)[-1] != 'Relationship':
                        continue
                    if rel.attrib.get('TargetMode', '').lower() != 'external':
                        continue
                    rel_type = rel.attrib.get('Type', '')
                    rel_name = rel_type.rsplit('/', 1)[-1]
                    if rel_name not in interesting_rels:
                        continue
                    findings.append({
                        'rels_file': rels_name,
                        'relationship': rel_name,
                        'target': rel.attrib.get('Target', ''),
                        'target_mode': 'External',
                        'full_type': rel_type,
                    })
    except zipfile.BadZipFile as e:
        # Passed the is_zipfile sniff but is corrupt; keep what we have.
        logger.debug("Corrupt zip container %s: %s", filepath, e)
    return findings