800d949482
- /files dashboard backed by per-sample _summary_cache.json with mtime-validated source set; ~8x on a single sample, scales linearly
- path_manager.find_file_by_hash keeps a per-folder hash->dirname index validated against folder mtime; ~10x on warm lookups (see the sketch below)
- BYOVD route reads compile_time from file_info.json instead of re-parsing the PE
- CHANGELOG entries for the perf cluster
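The hash-index change itself is not in the file below; as a rough illustration of the second bullet, here is a minimal sketch of an mtime-validated per-folder index. The index filename, the `sha256` field in `file_info.json`, and the validation rule (index is valid while it is at least as new as its folder) are assumptions inferred from the commit message, not the actual `path_manager` implementation.

import json
import os
from typing import Optional

_HASH_INDEX_FILE = '_hash_index.json'  # hypothetical name


def find_file_by_hash(root: str, file_hash: str) -> Optional[str]:
    """Map a sample hash to its result dirname via a cached per-folder
    index, rebuilt whenever the folder is newer than the index."""
    index_path = os.path.join(root, _HASH_INDEX_FILE)
    try:
        # Warm path: folder unchanged since the index was last written.
        if os.stat(root).st_mtime_ns <= os.stat(index_path).st_mtime_ns:
            with open(index_path, 'r', encoding='utf-8') as f:
                return json.load(f).get(file_hash)
    except (OSError, json.JSONDecodeError):
        pass
    # Cold path: rebuild by reading each sample's file_info.json
    # (assumed here to carry a sha256 field).
    index = {}
    for dirname in os.listdir(root):
        info_path = os.path.join(root, dirname, 'file_info.json')
        try:
            with open(info_path, 'r', encoding='utf-8') as f:
                index[json.load(f)['sha256']] = dirname
        except (OSError, json.JSONDecodeError, KeyError, TypeError):
            continue
    tmp = index_path + '.tmp'
    with open(tmp, 'w', encoding='utf-8') as f:
        json.dump(index, f)
    os.replace(tmp, index_path)
    # The replace itself bumps the folder mtime, so touch the index to
    # keep it at least as new as the folder it describes.
    os.utime(index_path)
    return index.get(file_hash)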
122 lines
4.2 KiB
Python
# app/services/summary_cache.py
"""On-disk cache for per-sample summary dicts.

The /files dashboard calls `process_file_summary` for every result
directory. Each call previously did 4-6 sequential JSON reads plus a
fresh `risk_analyzer.calculate_risk` walk over potentially multi-MB
analyzer outputs. The result is deterministic for a given set of
on-disk JSONs — perfect for caching.

This module persists a tiny `_summary_cache.json` next to the analyzer
outputs. Each cached entry stamps the mtimes of every source JSON it
depends on (file_info / static / dynamic / byovd / edr_*); a read
reconstructs the source mtimes and compares them against the stamp.
Any drift forces a recompute, so the cache stays correct without any
manual invalidation at write sites.

Cache miss (stale mtimes or no cache file): the caller falls back to
the slow recompute path and stores the fresh result on the way out.
"""

import json
import logging
import os
from typing import Dict, Optional


logger = logging.getLogger(__name__)


# Source files whose mtimes determine cache validity. Anything past
# this list (e.g. report HTML, ad-hoc operator notes) is intentionally
# outside the dependency set — adding a report doesn't invalidate the
# summary, since the report is derived from the same JSONs.
_FIXED_SOURCES = (
    'file_info.json',
    'static_analysis_results.json',
    'dynamic_analysis_results.json',
    'byovd_results.json',
)
_EDR_PREFIX = 'edr_'
_EDR_SUFFIX = '_results.json'

CACHE_FILE = '_summary_cache.json'


def get_cached(item_path: str) -> Optional[dict]:
    """Return a cached summary for `item_path` if its source mtimes
    match the current on-disk state. None on miss / staleness /
    corrupted cache."""
    cache_path = os.path.join(item_path, CACHE_FILE)
    if not os.path.exists(cache_path):
        return None
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            cached = json.load(f)
    except (json.JSONDecodeError, OSError) as exc:
        logger.debug(f"Summary cache read failed for {item_path}: {exc}")
        return None

    saved_sources = cached.get('_sources') or {}
    if saved_sources != _source_mtimes(item_path):
        return None

    return cached.get('summary')


def store(item_path: str, summary: dict) -> None:
    """Persist `summary` for `item_path` along with the current source
    mtimes. Failures are logged but not raised — the cache is purely
    a perf optimization and a missing entry just falls through to the
    slow path on the next read."""
    cache_path = os.path.join(item_path, CACHE_FILE)
    payload = {
        '_sources': _source_mtimes(item_path),
        'summary': summary,
    }
    try:
        # Write to a sibling .tmp then rename so a crash mid-write
        # never leaves a half-formed cache file behind.
        tmp = cache_path + '.tmp'
        with open(tmp, 'w', encoding='utf-8') as f:
            json.dump(payload, f)
        os.replace(tmp, cache_path)
    except OSError as exc:
        logger.debug(f"Summary cache write failed for {item_path}: {exc}")
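
# Illustrative `_summary_cache.json` contents; the mtime stamps, the
# EDR profile name, and the summary fields are made-up example values:
#
#     {"_sources": {"file_info.json": 1714060912345678901,
#                   "edr_defender_results.json": 1714060915000000000},
#      "summary": {"filename": "sample.exe", "risk_level": "high"}}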


def invalidate(item_path: str) -> None:
    """Remove the cached entry for `item_path`. Idempotent — a missing
    cache is fine. The mtime check normally makes manual invalidation
    unnecessary; this exists mostly for cleanup endpoints."""
    cache_path = os.path.join(item_path, CACHE_FILE)
    try:
        os.remove(cache_path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        logger.debug(f"Summary cache invalidate failed for {item_path}: {exc}")


# ---- internals ---------------------------------------------------------


def _source_mtimes(item_path: str) -> Dict[str, int]:
    """Snapshot the mtimes (in nanoseconds) of every source JSON we
    depend on. Discovers per-profile EDR result files dynamically so
    a freshly-added profile invalidates the cache automatically."""
    out: Dict[str, int] = {}
    try:
        entries = os.listdir(item_path)
    except OSError:
        # Result directory vanished (e.g. concurrent cleanup); return
        # an empty snapshot and let the caller recompute.
        return out
    for name in entries:
        if name in _FIXED_SOURCES or (
            name.startswith(_EDR_PREFIX) and name.endswith(_EDR_SUFFIX)
        ):
            try:
                out[name] = os.stat(os.path.join(item_path, name)).st_mtime_ns
            except OSError:
                # Deleted between listdir and stat; the shrunken source
                # set shows up as drift on the next comparison.
                pass
    return out
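

# Minimal manual smoke test (illustrative; not part of the service API).
# Run against any result directory: the first call misses and stores a
# placeholder, the second call hits until a source JSON changes.
if __name__ == '__main__':
    import sys
    import time

    target = sys.argv[1] if len(sys.argv) > 1 else '.'
    if get_cached(target) is None:
        print('miss -> storing placeholder summary')
        store(target, {'computed_at': time.time()})
    print('cached summary:', get_cached(target))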