# app/utils/file_io.py """File ingestion: type detection, PE/Office/LNK metadata, upload handling.""" import datetime import hashlib import json import mimetypes import os import pathlib import struct import pefile from werkzeug.utils import secure_filename from ..analyzers.static.lnk_parser import LnkForensics from .forensics import calculate_entropy, get_security_analyzer from .risk_analyzer import RiskCalculator class FileTypeDetector: """Detect file format from magic bytes and internal structure.""" MZ = b"MZ" CFBF = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" ZIP_PK = b"PK\x03\x04" LNK_HEADER = b"\x4C\x00\x00\x00" PE_MACHINES = {0x14c: "x86", 0x8664: "x64", 0x1c0: "ARM", 0xaa64: "ARM64"} @classmethod def detect_file_type(cls, filepath): try: p = pathlib.Path(filepath) with p.open('rb') as fp: header = fp.read(20) if header.startswith(cls.MZ): return cls._detect_pe_type(p) elif header.startswith(cls.CFBF): return cls._detect_ole_type(filepath) elif header.startswith(cls.ZIP_PK): return cls._detect_zip_type(filepath) elif header.startswith(cls.LNK_HEADER): return cls._detect_lnk_type(filepath) return {"family": "unknown", "type": "unknown"} except Exception as e: return {"family": "error", "type": str(e)} @classmethod def _detect_lnk_type(cls, filepath): try: with open(filepath, 'rb') as f: header = f.read(76) lnk_guid = b"\x01\x14\x02\x00\x00\x00\x00\x00\xC0\x00\x00\x00\x00\x00\x00\x46" if len(header) >= 20 and header[4:20] == lnk_guid: return {"family": "lnk", "type": "windows_shortcut"} else: return {"family": "lnk", "type": "invalid"} except Exception: return {"family": "lnk", "type": "error"} @classmethod def _detect_pe_type(cls, path): try: with path.open('rb') as fp: fp.seek(0x3C) pe_offset = struct.unpack(' 7.2 for section in sections_info): detection_notes.append( 'High entropy sections detected - Consider entropy reduction techniques' ) text_sections = [s for s in sections_info if s['name'] == '.text'] if text_sections and text_sections[0]['entropy'] > 7.0: detection_notes.append('Packed/encrypted code section may trigger heuristics') if any(not section['is_standard'] for section in sections_info): detection_notes.append( 'Non-standard PE sections detected - May trigger static analysis' ) return detection_notes def get_office_info(filepath, malapi_path): """Analyze Office macros (delegates to SecurityAnalyzer).""" return get_security_analyzer(malapi_path).analyze_office_macros(filepath) def get_lnk_info(filepath): """Analyze a Windows .LNK shortcut for forensic data.""" try: lnk = LnkForensics(filepath) if not lnk.is_valid(): return {'lnk_info': None} forensic_data = lnk.get_forensic_data() return {'lnk_info': forensic_data} except Exception as e: print(f"Error analyzing LNK file: {e}") return {'lnk_info': None} def _build_entropy_analysis(entropy_value): analysis = { 'value': entropy_value, 'detection_risk': ( 'High' if entropy_value > 7.2 else 'Medium' if entropy_value > 6.8 else 'Low' ), 'notes': [], } if entropy_value > 7.2: analysis['notes'].append( 'High entropy indicates encryption/packing - consider entropy reduction' ) elif entropy_value > 6.8: analysis['notes'].append('Moderate entropy - may trigger basic detection') return analysis def save_uploaded_file(file, config): """Persist an uploaded file, compute hashes/entropy/PE info, and write file_info.json.""" file_content = file.read() file.close() md5_hash = hashlib.md5(file_content).hexdigest() sha256_hash = hashlib.sha256(file_content).hexdigest() original_filename = secure_filename(file.filename) filename = f"{md5_hash}_{original_filename}" upload_folder = config['utils']['upload_folder'] result_folder = config['utils']['result_folder'] malapi_path = config['utils']['malapi_path'] os.makedirs(upload_folder, exist_ok=True) filepath = os.path.join(upload_folder, filename) os.makedirs(result_folder, exist_ok=True) os.makedirs(os.path.join(result_folder, filename), exist_ok=True) with open(filepath, 'wb') as f: f.write(file_content) entropy_value = calculate_entropy(file_content) file_type_info = detect_file_type(filepath) file_info = { 'original_name': original_filename, 'md5': md5_hash, 'sha256': sha256_hash, 'size': len(file_content), 'extension': file_type_info['type'], 'mime_type': mimetypes.guess_type(original_filename)[0] or 'application/octet-stream', 'upload_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'entropy': entropy_value, 'entropy_analysis': _build_entropy_analysis(entropy_value), 'detected_type': file_type_info, } if file_type_info['family'] == 'pe': file_info.update(get_pe_info(filepath, malapi_path)) if file_info.get('pe_info'): pe_info = file_info['pe_info'] build_with = pe_info.get('build_with') risk_score = 0 risk_factors = [] if build_with in ['go', 'rust']: risk_score = 15 risk_factors.append( f"Binary built with {build_with.upper()} - Runtime imports expected" ) else: pe_risk, pe_factors = RiskCalculator.calculate_pe_risk(pe_info) risk_score = pe_risk risk_factors.extend(pe_factors) if risk_score >= 75: risk_level = "Critical" elif risk_score >= 50: risk_level = "High" elif risk_score >= 25: risk_level = "Medium" else: risk_level = "Low" file_info['risk_assessment'] = { 'score': risk_score, 'level': risk_level, 'factors': risk_factors, } elif file_type_info['family'] == 'office': office_result = get_office_info(filepath, malapi_path) if 'error' not in office_result: file_info.update(office_result) elif file_type_info['family'] == 'lnk': lnk_result = get_lnk_info(filepath) if 'error' not in lnk_result: file_info.update(lnk_result) with open(os.path.join(result_folder, filename, 'file_info.json'), 'w') as f: json.dump(file_info, f) return file_info