litterbox/app/utils/office.py

# app/utils/office.py
"""Office document analyzer.

Runs at upload time on Word / Excel / RTF / legacy CFBF binaries (alongside
`get_pe_info`, `get_lnk_info`, `get_html_smuggle_info`). Output lands in
`file_info.office_info`.

Two analysis branches:

  1. olevba -- VBA / XLM macros embedded in the file. Pulls per-module
     source, autoexec triggers, suspicious keyword hits, IOCs.

  2. OOXML rels inspection -- external `attachedTemplate` / `oleObject` /
     `subDocument` / `frame` references. Catches T1221 (Remote Template
     Injection) which is invisible to olevba because the malicious VBA
     lives in a remote .dotm, not in the file itself. Atomic Red Team's
     `Calculator.docx` is the canonical example.
"""

import logging
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict, List

from oletools.olevba import VBA_Parser

logger = logging.getLogger(__name__)


# Relationship Types we care about when they target an external (HTTP/UNC)
# resource. `attachedTemplate` is the T1221 vector. The others pull remote
# content the same way; less common but the same class of risk.
_INTERESTING_RELS = (
    'attachedTemplate',
    'oleObject',
    'subDocument',
    'frame',
    'image',          # rare but seen in malicious docs that fetch tracking pixels
    'hyperlink',
)


def get_office_info(filepath: str, malapi_path=None) -> Dict:
    """Public entry. Returns `{office_info: {...}}` -- mirrors `get_lnk_info` /
    `get_html_smuggle_info` shape so file_io can do `file_info.update(result)`
    without conditionals.

    `malapi_path` is accepted for back-compat with the old SecurityAnalyzer
    delegation but isn't used here -- the office analyzer doesn't need
    MalAPI lookups.
    """
    info = {
        'file_type': 'Microsoft Office Document',
        'has_macros': False,
        'modules': [],          # [{stream, vba_filename, code}]
        'analysis': {
            'autoexec': [],     # [{keyword, description}] auto-execution triggers
            'suspicious': [],   # [{keyword, description}] suspicious keyword hits
            'iocs': [],         # [{type, value}] extracted URLs / IPs / EXEs / etc.
            'hex_strings': [],
            'base64_strings': [],
            'vba_strings': [],
        },
        'external_refs': [],    # external relationship targets (T1221 etc.)
        'detection_notes': [],
    }

    _run_olevba(filepath, info)
    _run_external_rels(filepath, info)

    return {'office_info': info}


def _run_olevba(filepath: str, info: Dict) -> None:
    """Branch 1 -- VBA / XLM macro analysis via oletools.olevba."""
    try:
        vbaparser = VBA_Parser(filepath)
    except Exception as e:
        logger.warning(f"olevba init failed on {filepath}: {e}")
        return

    try:
        if not vbaparser.detect_vba_macros():
            return

        info['has_macros'] = True

        # Per-module source code: (filename, stream_path, vba_filename, vba_code)
        for _, stream, vba_fname, vba_code in vbaparser.extract_macros():
            if vba_code:
                info['modules'].append({
                    'stream': stream,
                    'vba_filename': vba_fname,
                    'code': vba_code,
                })

        # Structured analysis -- olevba returns (kw_type, keyword, description)
        for kw_type, keyword, description in vbaparser.analyze_macros():
            kt = (kw_type or '').lower()
            entry = {'keyword': keyword, 'description': description}
            if kt == 'autoexec':
                info['analysis']['autoexec'].append(entry)
            elif kt == 'suspicious':
                info['analysis']['suspicious'].append(entry)
            elif kt == 'iocs':
                info['analysis']['iocs'].append({'type': keyword, 'value': description})
            elif kt == 'hex string':
                info['analysis']['hex_strings'].append(entry)
            elif kt == 'base64 string':
                info['analysis']['base64_strings'].append(entry)
            elif kt in ('vba string', 'vba_string'):
                info['analysis']['vba_strings'].append(entry)

        a = info['analysis']
        if a['autoexec']:
            info['detection_notes'].append(
                f"{len(a['autoexec'])} auto-execution trigger"
                f"{'s' if len(a['autoexec']) != 1 else ''} detected"
            )
        if a['suspicious']:
            info['detection_notes'].append(
                f"{len(a['suspicious'])} suspicious keyword"
                f"{'s' if len(a['suspicious']) != 1 else ''} in macro body"
            )
        if a['iocs']:
            info['detection_notes'].append(
                f"{len(a['iocs'])} IOC"
                f"{'s' if len(a['iocs']) != 1 else ''} extracted from macro"
            )
    except Exception as e:
        logger.warning(f"olevba analysis failed on {filepath}: {e}")
    finally:
        try:
            vbaparser.close()
        except Exception:
            pass


def _run_external_rels(filepath: str, info: Dict) -> None:
    """Branch 2 -- T1221 / external-relationship inspection."""
    try:
        external = _scan_external_relationships(filepath)
    except Exception as e:
        logger.warning(f"External-rels scan failed on {filepath}: {e}")
        return

    if not external:
        return

    info['external_refs'] = external

    t1221 = [r for r in external if r['relationship'] == 'attachedTemplate']
    if t1221:
        info['detection_notes'].append(
            f"MITRE T1221: Remote Template Injection -- {len(t1221)} "
            f"external `attachedTemplate` reference"
            f"{'s' if len(t1221) != 1 else ''}. "
            f"Malicious VBA likely lives in the remote target, not in this file."
        )

    ole_remote = [r for r in external if r['relationship'] == 'oleObject']
    if ole_remote:
        info['detection_notes'].append(
            f"{len(ole_remote)} external OLE-object reference"
            f"{'s' if len(ole_remote) != 1 else ''} -- remote-fetched embedded payload"
        )

    subdoc = [r for r in external if r['relationship'] == 'subDocument']
    if subdoc:
        info['detection_notes'].append(
            f"{len(subdoc)} external subDocument reference"
            f"{'s' if len(subdoc) != 1 else ''}"
        )


def _scan_external_relationships(filepath: str) -> List[Dict]:
    """Walk every `*.rels` file inside an OOXML container and return the list
    of relationships whose `TargetMode` is `External` AND whose Type is one
    of `_INTERESTING_RELS`. Returns `[]` for non-zip files (legacy CFBF
    .doc/.xls binaries).
    """
    if not zipfile.is_zipfile(filepath):
        return []

    findings: List[Dict] = []
    try:
        with zipfile.ZipFile(filepath) as z:
            rels_files = [n for n in z.namelist() if n.endswith('.rels')]
            for rels_name in rels_files:
                try:
                    data = z.read(rels_name)
                except Exception:
                    continue
                try:
                    root = ET.fromstring(data)
                except ET.ParseError:
                    continue

                for rel in root.iter():
                    tag = rel.tag.rsplit('}', 1)[-1] if '}' in rel.tag else rel.tag
                    if tag != 'Relationship':
                        continue
                    if rel.attrib.get('TargetMode', '').lower() != 'external':
                        continue
                    rel_type = rel.attrib.get('Type', '')
                    target = rel.attrib.get('Target', '')
                    rel_name = rel_type.rsplit('/', 1)[-1] if '/' in rel_type else rel_type
                    if rel_name not in _INTERESTING_RELS:
                        continue
                    findings.append({
                        'rels_file': rels_name,
                        'relationship': rel_name,
                        'target': target,
                        'target_mode': 'External',
                        'full_type': rel_type,
                    })
    except zipfile.BadZipFile:
        pass

    return findings