From 366e5002e1d5ca53ca9138f9064ac2d587c0811b Mon Sep 17 00:00:00 2001 From: Justin Ibarra Date: Wed, 2 Dec 2020 08:25:33 +0100 Subject: [PATCH] [FR] Add experimental ML DGA CLI support (#361) * Add DGA model commands * Add upload/delete ML job command * Add DGA release management commands * Add Manifest handling * Add GithubClient object --- detection_rules/devtools.py | 221 ++++++++++++++++++++- detection_rules/eswrap.py | 327 +++++++++++++++++++++++++++++++- detection_rules/misc.py | 312 ++++++++++++++++++++++++++++++ detection_rules/rule.py | 2 +- detection_rules/rule_loader.py | 1 + detection_rules/schemas/base.py | 4 +- docs/ML_DGA.md | 176 +++++++++++++++++ requirements-dev.txt | 1 + 8 files changed, 1038 insertions(+), 6 deletions(-) create mode 100644 docs/ML_DGA.md create mode 100644 requirements-dev.txt diff --git a/detection_rules/devtools.py b/detection_rules/devtools.py index 0f1ccc211..c62221e57 100644 --- a/detection_rules/devtools.py +++ b/detection_rules/devtools.py @@ -4,12 +4,14 @@ """CLI commands for internal detection_rules dev team.""" import glob +import hashlib import io import json import os import shutil import subprocess import time +from pathlib import Path import click from elasticsearch import Elasticsearch @@ -19,7 +21,7 @@ from kibana.connector import Kibana from . import rule_loader from .eswrap import CollectEvents, add_range_to_dsl from .main import root -from .misc import PYTHON_LICENSE, add_client, client_error +from .misc import PYTHON_LICENSE, add_client, GithubClient, Manifest, client_error, getdefault from .packaging import PACKAGE_FILE, Package, manage_versions, RELEASE_DIR from .rule import Rule from .rule_loader import get_rule @@ -354,3 +356,220 @@ def rule_survey(ctx: click.Context, query, date_range, dump_file, hide_zero_coun json.dump(details, f, indent=2, sort_keys=True) return survey_results + + +@dev_group.group('gh-release') +def gh_release_group(): + """Commands to manage GitHub releases.""" + + +@gh_release_group.command('create-ml') +@click.argument('directory', type=click.Path(dir_okay=True, file_okay=False)) +@click.option('--gh-token', '-t', default=getdefault('gh_token')) +@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo') +@click.option('--release-name', '-n', required=True, help='Name of release') +@click.option('--description', '-d', help='Description of release to append to default message') +@click.pass_context +def create_ml_release(ctx, directory, gh_token, repo, release_name, description): + """Create a GitHub release.""" + import re + + # ML-DGA-20201129-25 + pattern = r'^(ML-DGA|ML-experimental-detections)-\d{4}\d{2}\d{2}-\d+$' + assert re.match(pattern, release_name), f'release name must match pattern: {pattern}' + assert Path(directory).name == release_name, f'directory name must match release name: {release_name}' + + gh_token = gh_token or click.prompt('GitHub token', hide_input=True) + client = GithubClient(gh_token) + gh_repo = client.authenticated_client.get_repo(repo) + + # validate tag name is increment by 1 + name_prefix, _, version = release_name.rsplit('-', 2) + version = int(version) + releases = gh_repo.get_releases() + max_ver = max([int(r.raw_data['name'].split('-')[-1]) for r in releases + if r.raw_data['name'].startswith(name_prefix)], default=0) + + if version != (max_ver + 1): + client_error(f'Last release version was {max_ver}. Release name should end with version: {max_ver + 1}') + + # validate files + if name_prefix == 'ML-DGA': + zipped_bundle, description_str = ctx.invoke(validate_ml_dga_asset, directory=directory, repo=repo) + else: + zipped_bundle, description_str = ctx.invoke(validate_ml_detections_asset, directory=directory) + + click.confirm('Validation passed, verify output. Continue?') + + if description: + description_str = f'{description_str}\n\n----\n\n{description}' + + release = gh_repo.create_git_release(name=release_name, tag=release_name, message=description_str) + zip_name = Path(zipped_bundle).name + + click.echo(f'release created at: {release.html_url}') + + # add zipped bundle as an asset to the release + click.echo(f'Uploading zip file: {zip_name}') + release.upload_asset(zipped_bundle, label=zip_name, name=zip_name, content_type='application/zip') + + # create manifest entry + click.echo('creating manifest for release') + manifest = Manifest(repo, tag_name=release_name, token=gh_token) + manifest.save() + + return release + + +@gh_release_group.command('validate-ml-dga-asset') +@click.argument('directory', type=click.Path(exists=True, file_okay=False)) +@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo') +def validate_ml_dga_asset(directory, repo): + """"Validate and prep an ML DGA bundle for release.""" + from .eswrap import expected_ml_dga_patterns + + now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) + + files = list(Path(directory).glob('*')) + if len(files) > 5: + client_error('Too many files, expected 5') + + click.secho('[*] validated expected number of files', fg='green') + + # backup files - will re-save sorted to have deterministic hash + backup_path = Path(directory).resolve().parent.joinpath(f'backups-{Path(directory).name}-{now.replace(":", "-")}') + shutil.copytree(directory, backup_path) + + # validate file names and json and load + loaded_contents = {} + for name, pattern in expected_ml_dga_patterns.items(): + path = list(Path(directory).glob(pattern)) + match_count = len(path) + if match_count != 1: + client_error(f'Expected filename pattern "{pattern}" for "{name}": {match_count} matches detected') + + file_path = path[0] + try: + with open(file_path, 'r') as f: + contents = json.dumps(json.load(f), sort_keys=True) + loaded_contents[name] = {'contents': contents, 'filename': file_path} + + sha256 = hashlib.sha256(contents.encode('utf-8')).hexdigest() + click.secho(f' - sha256: {sha256} - {name}') + + # re-save sorted + with open(file_path, 'w') as f: + f.write(contents) + except json.JSONDecodeError as e: + client_error(f'Invalid JSON in {file_path} file', e) + + model_filename = Path(loaded_contents['model']['filename']).name + model_name, _ = model_filename.rsplit('_', maxsplit=1) + + click.secho('[*] re-saved all files with keys sorted for deterministic hashing', fg='green') + click.secho(f' [+] backups saved to: {backup_path}') + click.secho('[*] validated expected naming patterns for all files', fg='green') + click.secho('[*] validated json formatting of all files', fg='green') + + # check manifest for existing things + existing_sha = False + existing_model_name = False + model_hash = hashlib.sha256(loaded_contents['model']['contents'].encode('utf-8')).hexdigest() + manifest_hashes = Manifest.get_existing_asset_hashes(repo) + + for release, file_data in manifest_hashes.items(): + for file_name, sha in file_data.items(): + if model_hash == sha: + existing_sha = True + click.secho(f'[!] hash for model file: "{loaded_contents["model"]["filename"]}" matches: ' + f'{release} -> {file_name} -> {sha}', fg='yellow') + + if model_filename == file_name: + existing_model_name = True + client_error(f'name for model file: "{loaded_contents["model"]["filename"]}" matches: ' + f'{release} -> {file_name} -> {file_name}') + + if not existing_sha: + click.secho(f'[+] validated no existing models matched hashes for: ' + f'{loaded_contents["model"]["filename"]}', fg='green') + + if not existing_model_name: + click.secho(f'[+] validated no existing models matched names for: ' + f'{loaded_contents["model"]["filename"]}', fg='green') + + # save zip + zip_name_no_ext = Path(directory).resolve() + zip_name = f'{zip_name_no_ext}.zip' + shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name) + click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green') + + click.secho(f'[!] run `setup-dga-model -d {directory}` to test this on a live stack before releasing', fg='yellow') + + description = { + 'model_name': model_name + '\n\n----\n\n', + 'date': now, + 'model_sha256': model_hash, + 'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md' + } + description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()]) + click.echo() + click.echo(f'[*] description to paste with release:\n\n{description_str}\n') + + return zip_name, description_str + + +@gh_release_group.command('validate-ml-detections-asset') +@click.argument('directory', type=click.Path(exists=True, file_okay=False)) +def validate_ml_detections_asset(directory): + """Validate and prep ML detection rules and jobs before release.""" + import pytoml + + now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) + + job_paths = list(Path(directory).glob('*.json')) + rule_paths = list(Path(directory).glob('*.toml')) + job_count = len(job_paths) + rule_count = len(rule_paths) + + for job in job_paths: + try: + with open(job, 'r') as f: + j = json.load(f) + assert j.get('name'), click.style(f'[!] job file "{job}" missing: name', fg='red') + assert j.get('type'), click.style(f'[!] job file "{job}" missing: type', fg='red') + assert j.get('body'), click.style(f'[!] job file "{job}" missing: body', fg='red') + except json.JSONDecodeError as e: + client_error(f'Invalid JSON in {job} file', e) + + click.secho(f'[*] validated json formatting and required fields in {job_count} job files', fg='green') + + for rule in rule_paths: + with open(rule, 'r') as f: + try: + pytoml.load(f) + except pytoml.TomlError as e: + client_error(f'[!] invalid rule toml for: {rule}', e) + + click.secho(f'[*] validated toml formatting for {rule_count} rule files', fg='green') + + # save zip + zip_name_no_ext = Path(directory).resolve() + zip_name = f'{zip_name_no_ext}.zip' + shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name) + click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green') + + click.secho('[!] run `kibana upload-rule` to test rules on a live stack before releasing', fg='green') + click.secho('[!] run `es upload-ml-job` to test jobs on a live stack before releasing', fg='green') + + description = { + 'Experimental ML rules': rule_count, + 'Experimental ML jobs': str(job_count) + '\n\n----\n\n', + 'DGA release': '', + 'date': now, + 'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md' + } + description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()]) + click.echo() + click.echo(f'description to paste with release:\n\n{description_str}\n') + + return zip_name, description_str diff --git a/detection_rules/eswrap.py b/detection_rules/eswrap.py index a3ecd4c9d..3774b7448 100644 --- a/detection_rules/eswrap.py +++ b/detection_rules/eswrap.py @@ -6,13 +6,15 @@ import json import os import time +from contextlib import contextmanager from collections import defaultdict +from pathlib import Path from typing import Union import click import elasticsearch from elasticsearch import Elasticsearch -from elasticsearch.client import AsyncSearchClient +from elasticsearch.client import AsyncSearchClient, IngestClient, LicenseClient, MlClient import kql from .main import root @@ -34,9 +36,11 @@ def get_elasticsearch_client(cloud_id=None, elasticsearch_url=None, es_user=None es_user = es_user or click.prompt("es_user") es_password = es_password or click.prompt("es_password", hide_input=True) hosts = [elasticsearch_url] if elasticsearch_url else None + timeout = kwargs.pop('timeout', 60) try: - client = Elasticsearch(hosts=hosts, cloud_id=cloud_id, http_auth=(es_user, es_password), **kwargs) + client = Elasticsearch(hosts=hosts, cloud_id=cloud_id, http_auth=(es_user, es_password), timeout=timeout, + **kwargs) # force login to test auth client.info() return client @@ -384,3 +388,322 @@ def collect_events(ctx, host_id, query, index, rta_name, rule_id, view_events): except AssertionError as e: error_msg = 'No events collected! Verify events are streaming and that the agent-hostname is correct' client_error(error_msg, e, ctx=ctx) + + +@es_group.group('experimental') +def es_experimental(): + """[Experimental] helper commands for integrating with Elasticsearch.""" + click.secho('\n* experimental commands are use at your own risk and may change without warning *\n') + + +@es_experimental.command('check-model-files') +@click.pass_context +def check_model_files(ctx): + """Check ML model files on an elasticsearch instance.""" + from elasticsearch.client import IngestClient, MlClient + from .misc import get_ml_model_manifests_by_model_id + + es_client: Elasticsearch = ctx.obj['es'] + ml_client = MlClient(es_client) + ingest_client = IngestClient(es_client) + + def safe_get(func, arg): + try: + return func(arg) + except elasticsearch.NotFoundError: + return None + + models = [m for m in ml_client.get_trained_models().get('trained_model_configs', []) + if m['created_by'] != '_xpack'] + + if models: + if len([m for m in models if m['model_id'].startswith('dga_')]) > 1: + click.secho('Multiple DGA models detected! It is not recommended to run more than one DGA model at a time', + fg='yellow') + + manifests = get_ml_model_manifests_by_model_id() + + click.echo(f'DGA Model{"s" if len(models) > 1 else ""} found:') + for model in models: + manifest = manifests.get(model['model_id']) + click.echo(f' - {model["model_id"]}, associated release: {manifest.html_url if manifest else None}') + else: + click.echo('No DGA Models found') + + support_files = { + 'create_script': safe_get(es_client.get_script, 'dga_ngrams_create'), + 'delete_script': safe_get(es_client.get_script, 'dga_ngrams_transform_delete'), + 'enrich_pipeline': safe_get(ingest_client.get_pipeline, 'dns_enrich_pipeline'), + 'inference_pipeline': safe_get(ingest_client.get_pipeline, 'dns_dga_inference_enrich_pipeline') + } + + click.echo('Support Files:') + for support_file, results in support_files.items(): + click.echo(f' - {support_file}: {"found" if results else "not found"}') + + +@es_experimental.command('remove-dga-model') +@click.argument('model-id') +@click.option('--force', '-f', is_flag=True, help='Force the attempted delete without checking if model exists') +@click.pass_context +def remove_dga_model(ctx, model_id, force, es_client: Elasticsearch = None, ml_client: MlClient = None, + ingest_client: IngestClient = None): + """Remove ML DGA files.""" + from elasticsearch.client import IngestClient, MlClient + + es_client = es_client or ctx.obj['es'] + ml_client = ml_client or MlClient(es_client) + ingest_client = ingest_client or IngestClient(es_client) + + def safe_delete(func, fid, verbose=True): + try: + func(fid) + except elasticsearch.NotFoundError: + return False + if verbose: + click.echo(f' - {fid} deleted') + return True + + model_exists = False + if not force: + existing_models = ml_client.get_trained_models() + model_exists = model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])] + + if model_exists or force: + if model_exists: + click.secho('[-] Existing model detected - deleting files', fg='yellow') + + deleted = [ + safe_delete(ingest_client.delete_pipeline, 'dns_dga_inference_enrich_pipeline'), + safe_delete(ingest_client.delete_pipeline, 'dns_enrich_pipeline'), + safe_delete(es_client.delete_script, 'dga_ngrams_transform_delete'), + # f'{model_id}_dga_ngrams_transform_delete' + safe_delete(es_client.delete_script, 'dga_ngrams_create'), + # f'{model_id}_dga_ngrams_create' + safe_delete(ml_client.delete_trained_model, model_id) + ] + + if not any(deleted): + click.echo('No files deleted') + else: + click.echo(f'Model: {model_id} not found') + + +expected_ml_dga_patterns = { + 'model': 'dga_*_model.json', # noqa: E241 + 'dga_ngrams_create': 'dga_*_ngrams_create.json', # noqa: E241 + 'dga_ngrams_transform_delete': 'dga_*_ngrams_transform_delete.json', # noqa: E241 + 'dns_enrich_pipeline': 'dga_*_ingest_pipeline1.json', # noqa: E241 + 'dns_dga_inference_enrich_pipeline': 'dga_*_ingest_pipeline2.json' # noqa: E241 +} + + +@es_experimental.command('setup-dga-model') +@click.option('--model-tag', '-t', + help='Release tag for model files staged in detection-rules (required to download files)') +@click.option('--repo', '-r', default='elastic/detection-rules', + help='GitHub repository hosting the model file releases (owner/repo)') +@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False), + help='Directory containing local model files') +@click.option('--overwrite', is_flag=True, help='Overwrite all files if already in the stack') +@click.pass_context +def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite): + """Upload ML DGA model and dependencies and enrich DNS data.""" + import io + import requests + import shutil + import zipfile + + es_client: Elasticsearch = ctx.obj['es'] + client_info = es_client.info() + license_client = LicenseClient(es_client) + + if license_client.get()['license']['type'].lower() not in ('platinum', 'enterprise'): + client_error('You must have a platinum or enterprise subscription in order to use these ML features') + + # download files if necessary + if not model_dir: + if not model_tag: + client_error('model-tag or model-dir required to download model files') + + click.echo(f'Downloading artifact: {model_tag}') + + release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}' + release = requests.get(release_url) + release.raise_for_status() + assets = [a for a in release.json()['assets'] if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')] + + if len(assets) != 1: + client_error(f'Malformed release: expected 1 match ML-DGA zip, found: {len(assets)}!') + + zipped_url = assets[0]['browser_download_url'] + zipped = requests.get(zipped_url) + z = zipfile.ZipFile(io.BytesIO(zipped.content)) + + dga_dir = get_path('ML-models', 'DGA') + model_dir = os.path.join(dga_dir, model_tag) + os.makedirs(dga_dir, exist_ok=True) + shutil.rmtree(model_dir, ignore_errors=True) + z.extractall(dga_dir) + click.echo(f'files saved to {model_dir}') + + # read files as needed + z.close() + + def get_model_filename(pattern): + paths = list(Path(model_dir).glob(pattern)) + if not paths: + client_error(f'{model_dir} missing files matching the pattern: {pattern}') + if len(paths) > 1: + client_error(f'{model_dir} contains multiple files matching the pattern: {pattern}') + + return paths[0] + + @contextmanager + def open_model_file(name): + pattern = expected_ml_dga_patterns[name] + with open(get_model_filename(pattern), 'r') as f: + yield json.load(f) + + model_id, _ = os.path.basename(get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1) + + click.echo(f'Setting up DGA model: "{model_id}" on {client_info["name"]} ({client_info["version"]["number"]})') + + # upload model + ml_client = MlClient(es_client) + ingest_client = IngestClient(es_client) + + existing_models = ml_client.get_trained_models() + if model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]: + if overwrite: + ctx.invoke(remove_dga_model, model_id=model_id, es_client=es_client, ml_client=ml_client, + ingest_client=ingest_client, force=True) + else: + client_error(f'Model: {model_id} already exists on stack! Try --overwrite to force the upload') + + click.secho('[+] Uploading model (may take a while)') + + with open_model_file('model') as model_file: + try: + ml_client.put_trained_model(model_id=model_id, body=model_file) + except elasticsearch.ConnectionTimeout: + msg = 'Connection timeout, try increasing timeout using `es --timeout experimental setup_dga_model`.' + client_error(msg) + + # install scripts + click.secho('[+] Uploading painless scripts') + + with open_model_file('dga_ngrams_create') as painless_install: + es_client.put_script(id='dga_ngrams_create', body=painless_install) + # f'{model_id}_dga_ngrams_create' + + with open_model_file('dga_ngrams_transform_delete') as painless_delete: + es_client.put_script(id='dga_ngrams_transform_delete', body=painless_delete) + # f'{model_id}_dga_ngrams_transform_delete' + + # Install ingest pipelines + click.secho('[+] Uploading pipelines') + + def _build_es_script_error(err, pipeline_file): + error = err.info['error'] + cause = error['caused_by'] + + error_msg = [ + f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}', + ' '.join(f'{k}: {v}' for k, v in error['position'].items()), + '\n'.join(error['script_stack']) + ] + + return click.style('\n'.join(error_msg), fg='red') + + with open_model_file('dns_enrich_pipeline') as ingest_pipeline1: + try: + ingest_client.put_pipeline(id='dns_enrich_pipeline', body=ingest_pipeline1) + except elasticsearch.RequestError as e: + if e.error == 'script_exception': + client_error(_build_es_script_error(e, 'ingest_pipeline1'), e, ctx=ctx) + else: + raise + + with open_model_file('dns_dga_inference_enrich_pipeline') as ingest_pipeline2: + try: + ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline', body=ingest_pipeline2) + except elasticsearch.RequestError as e: + if e.error == 'script_exception': + client_error(_build_es_script_error(e, 'ingest_pipeline2'), e, ctx=ctx) + else: + raise + + click.echo('Ensure that you have updated your packetbeat.yml config file.') + click.echo(' - reference: ML_DGA.md #2-update-packetbeat-configuration') + click.echo('To upload rules, run: kibana upload-rule ') + click.echo('To upload ML jobs, run: es experimental upload-ml-job ') + + +@es_experimental.command('upload-ml-job') +@click.argument('job-file', type=click.Path(exists=True, dir_okay=False)) +@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name') +@click.pass_context +def upload_ml_job(ctx: click.Context, job_file, overwrite): + """Upload experimental ML jobs.""" + es_client: Elasticsearch = ctx.obj['es'] + ml_client = MlClient(es_client) + + with open(job_file, 'r') as f: + job = json.load(f) + + def safe_upload(func): + try: + func(name, body) + except (elasticsearch.ConflictError, elasticsearch.RequestError) as err: + if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception': + client_error(str(err), err, ctx=ctx) + + if overwrite: + ctx.invoke(delete_ml_job, job_name=name, job_type=job_type) + func(name, body) + else: + client_error(str(err), err, ctx=ctx) + + try: + job_type = job['type'] + name = job['name'] + body = job['body'] + + if job_type == 'anomaly_detection': + safe_upload(ml_client.put_job) + elif job_type == 'data_frame_analytic': + safe_upload(ml_client.put_data_frame_analytics) + elif job_type == 'datafeed': + safe_upload(ml_client.put_datafeed) + else: + client_error(f'Unknown ML job type: {job_type}') + + click.echo(f'Uploaded {job_type} job: {name}') + except KeyError as e: + client_error(f'{job_file} missing required info: {e}') + + +@es_experimental.command('delete-ml-job') +@click.argument('job-name') +@click.argument('job-type') +@click.pass_context +def delete_ml_job(ctx: click.Context, job_name, job_type, verbose=True): + """Remove experimental ML jobs.""" + es_client: Elasticsearch = ctx.obj['es'] + ml_client = MlClient(es_client) + + try: + if job_type == 'anomaly_detection': + ml_client.delete_job(job_name) + elif job_type == 'data_frame_analytic': + ml_client.delete_data_frame_analytics(job_name) + elif job_type == 'datafeed': + ml_client.delete_datafeed(job_name) + else: + client_error(f'Unknown ML job type: {job_type}') + except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e: + client_error(str(e), e, ctx=ctx) + + if verbose: + click.echo(f'Deleted {job_type} job: {job_name}') diff --git a/detection_rules/misc.py b/detection_rules/misc.py index 7ef9949dc..58a3772c1 100644 --- a/detection_rules/misc.py +++ b/detection_rules/misc.py @@ -3,16 +3,39 @@ # you may not use this file except in compliance with the Elastic License. """Misc support.""" +import hashlib +import io import json import os import re +import shutil import time import uuid +import dataclasses + +from dataclasses import dataclass, field +from datetime import datetime from functools import wraps +from pathlib import Path +from typing import Dict, Tuple +from zipfile import ZipFile import click import requests +# this is primarily for type hinting - all use of the github client should come from GithubClient class +try: + from github import Github + from github.Repository import Repository + from github.GitRelease import GitRelease + from github.GitReleaseAsset import GitReleaseAsset +except ImportError: + # for type hinting + Github = None # noqa: N806 + Repository = None # noqa: N806 + GitRelease = None # noqa: N806 + GitReleaseAsset = None # noqa: N806 + from .utils import add_params, cached, get_path _CONFIG = {} @@ -32,6 +55,295 @@ JS_LICENSE = """ """.strip().format("\n".join(' * ' + line for line in LICENSE_LINES)) +def get_gh_release(repo: Repository, release_name=None, tag_name=None) -> GitRelease: + """Get a list of GitHub releases by repo.""" + assert release_name or tag_name, 'Must specify a release_name or tag_name' + + releases = repo.get_releases() + release = next((r for r in releases + if (release_name and release_name == r.title) or (tag_name and tag_name == r.tag_name)), None) + + return release + + +def upload_gh_release_asset(token, asset: bytes, asset_name, repo=None, content_type='application/zip', + release_name=None, tag_name=None, upload_url=None): + """Save a Github relase asset.""" + if not upload_url: + assert repo, 'You must provide a repo name if not providing an upload_url' + + release = get_gh_release(repo, release_name, tag_name) + upload_url = release['upload_url'] + suffix = '{?name,label}' + upload_url = upload_url.replace(suffix, f'?name={asset_name}&label={asset_name}') + + headers = {'content-type': content_type} + r = requests.post(upload_url, auth=('', token), data=asset, headers=headers) + r.raise_for_status() + click.echo(f'Uploaded {asset_name} to release: {r.json()["url"]}') + + +def load_zipped_gh_assets_with_metadata(url) -> Tuple[str, dict]: + """Download and unzip a GitHub assets.""" + response = requests.get(url) + zipped_asset = ZipFile(io.BytesIO(response.content)) + zipped_sha256 = hashlib.sha256(response.content).hexdigest() + + assets = {} + for zipped in zipped_asset.filelist: + if zipped.is_dir(): + continue + + contents = zipped_asset.read(zipped.filename) + sha256 = hashlib.sha256(contents).hexdigest() + + assets[zipped.filename] = { + 'contents': contents, + 'metadata': { + 'compress_size': zipped.compress_size, + # zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday + 'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)), + 'sha256': sha256, + 'size': zipped.file_size, + } + } + + return zipped_sha256, assets + + +def load_json_gh_asset(url) -> dict: + """Load and return the contents of a json asset file.""" + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def download_gh_asset(url, path, overwrite=False): + """Download and unzip a GitHub asset.""" + zipped = requests.get(url) + z = ZipFile(io.BytesIO(zipped.content)) + + Path(path).mkdir(exist_ok=True) + if overwrite: + shutil.rmtree(path, ignore_errors=True) + + z.extractall(path) + click.echo(f'files saved to {path}') + + z.close() + + +class GithubClient: + """GitHub client wrapper.""" + + def __init__(self, token=None): + """Get an unauthenticated client, verified authenticated client, or a default client.""" + if not Github: + raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`') + + self.client: Github = Github(token) + self.unauthenticated_client = Github() + self.__token = token + self.__authenticated_client = None + + @property + def authenticated_client(self) -> Github: + if not self.__token: + raise ValueError('Token not defined! Re-instantiate with a token or use add_token method') + if not self.__authenticated_client: + self.__authenticated_client = Github(self.__token) + return self.__authenticated_client + + def add_token(self, token): + self.__token = token + + +@dataclass +class AssetManifestEntry: + + compress_size: int + created_at: datetime + name: str + sha256: str + size: int + + +@dataclass +class AssetManifestMetadata: + + relative_url: str + entries: Dict[str, AssetManifestEntry] + zipped_sha256: str + created_at: datetime = field(default_factory=datetime.utcnow) + description: str = None # label + + +@dataclass +class ReleaseManifest: + + assets: Dict[str, AssetManifestMetadata] + assets_url: str + author: str # parsed from GitHub release metadata as: author[login] + created_at: str + html_url: str + id: int + name: str + published_at: str + url: str + zipball_url: str + tag_name: str = None + description: str = None # body + + +class Manifest: + """Manifest handler for GitHub releases.""" + + def __init__(self, repo: str = 'elastic/detection-rules', release_name=None, tag_name=None, token=None): + self.repo_name = repo + self.release_name = release_name + self.tag_name = tag_name + self.gh_client = GithubClient(token) + self.has_token = token is not None + + self.repo: Repository = self.gh_client.client.get_repo(repo) + self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name) + + if not self.release: + raise ValueError(f'No release found for {tag_name or release_name}') + + if not self.release_name: + self.release_name = self.release.title + + self.manifest_name = f'manifest-{self.release_name}.json' + self.assets: dict = self._get_enriched_assets_from_release() + self.release_manifest = self._create() + self.__release_manifest_dict = dataclasses.asdict(self.release_manifest) + self.manifest_size = len(json.dumps(self.__release_manifest_dict)) + + @property + def release_manifest_fl(self): + return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8')) + + def _create(self): + """Create the manifest from GitHub asset metadata and file contents.""" + assets = {} + for asset_name, asset_data in self.assets.items(): + entries = {} + data = asset_data['data'] + metadata = asset_data['metadata'] + + for file_name, file_data in data.items(): + file_metadata = file_data['metadata'] + + name = Path(file_name).name + file_metadata.update(name=name) + + entry = AssetManifestEntry(**file_metadata) + entries[name] = entry + + assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries, + metadata['zipped_sha256'], metadata['created_at'], + metadata['label']) + + release_metadata = self._parse_release_metadata() + release_metadata.update(assets=assets) + release_manifest = ReleaseManifest(**release_metadata) + + return release_manifest + + def _parse_release_metadata(self): + """Parse relevant info from GitHub metadata for release manifest.""" + ignore = ['assets'] + manual_set_keys = ['author', 'description'] + keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys] + parsed = {k: self.release.raw_data[k] for k in keys} + parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login']) + return parsed + + def save(self) -> GitReleaseAsset: + """Save manifest files.""" + if not self.has_token: + raise ValueError('You must provide a token to save a manifest to a GitHub release') + + asset = self.release.upload_asset_from_memory(self.release_manifest_fl, + self.manifest_size, + self.manifest_name) + click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}') + return asset + + @classmethod + def load(cls, name: str, repo: str = 'elastic/detection-rules', token=None) -> dict: + """Load a manifest.""" + gh_client = GithubClient(token) + repo = gh_client.client.get_repo(repo) + release = get_gh_release(repo, tag_name=name) + asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None) + + if asset is not None: + return load_json_gh_asset(asset.browser_download_url) + + @classmethod + def load_all(cls, repo: str = 'elastic/detection-rules', token=None) -> Tuple[Dict[str, dict], list]: + """Load a consolidated manifest.""" + gh_client = GithubClient(token) + repo = gh_client.client.get_repo(repo) + + consolidated = {} + missing = set() + for release in repo.get_releases(): + name = release.tag_name + asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None) + if not asset: + missing.add(name) + else: + consolidated[name] = load_json_gh_asset(asset.browser_download_url) + + return consolidated, list(missing) + + @classmethod + def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token=None) -> dict: + """Load all assets with their hashes, by release.""" + flat = {} + consolidated, _ = cls.load_all(repo=repo, token=token) + for release, data in consolidated.items(): + for asset in data['assets'].values(): + flat_release = flat[release] = {} + for asset_name, asset_data in asset['entries'].items(): + flat_release[asset_name] = asset_data['sha256'] + + return flat + + def _get_enriched_assets_from_release(self): + """Get assets and metadata from a GitHub release.""" + assets = {} + for asset in [a.raw_data for a in self.release.get_assets()]: + zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url']) + asset.update(zipped_sha256=zipped_sha256) + + assets[asset['name']] = { + 'metadata': asset, + 'data': data + } + + return assets + + +def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]: + """Load all ML DGA model release manifests by model id.""" + manifests, _ = Manifest.load_all(repo=repo) + model_manifests = {} + + for manifest_name, manifest in manifests.items(): + for asset_name, asset in manifest['assets'].items(): + for entry_name, entry_data in asset['entries'].items(): + if entry_name.startswith('dga') and entry_name.endswith('model.json'): + model_id, _ = entry_name.rsplit('_', 1) + model_manifests[model_id] = ReleaseManifest(**manifest) + break + + return model_manifests + + class ClientError(click.ClickException): """Custom CLI error to format output or full debug stacktrace.""" diff --git a/detection_rules/rule.py b/detection_rules/rule.py index 10bb65ad2..f024d90fa 100644 --- a/detection_rules/rule.py +++ b/detection_rules/rule.py @@ -190,7 +190,7 @@ class Rule(object): schema_cls.validate(contents, role=self.type) - skip_query_validation = self.metadata['maturity'] == 'development' and \ + skip_query_validation = self.metadata['maturity'] in ('experimental', 'development') and \ self.metadata.get('query_schema_validation') is False if query and self.query is not None and not skip_query_validation: diff --git a/detection_rules/rule_loader.py b/detection_rules/rule_loader.py index 57c46da6f..2528d452a 100644 --- a/detection_rules/rule_loader.py +++ b/detection_rules/rule_loader.py @@ -187,6 +187,7 @@ rta_mappings = RtaMappings() __all__ = ( "load_rule_files", "load_rules", + "load_rule_files", "get_file_name", "get_production_rules", "get_rule", diff --git a/detection_rules/schemas/base.py b/detection_rules/schemas/base.py index 40ed6c4f0..d655c9b64 100644 --- a/detection_rules/schemas/base.py +++ b/detection_rules/schemas/base.py @@ -13,8 +13,8 @@ from ..utils import cached DATE_PATTERN = r'\d{4}/\d{2}/\d{2}' -MATURITY_LEVELS = ['development', 'testing', 'staged', 'production', 'deprecated'] -OS_OPTIONS = ['windows', 'linux', 'macos', 'solaris'] # need to verify with ecs +MATURITY_LEVELS = ['development', 'experimental', 'beta', 'production', 'deprecated'] +OS_OPTIONS = ['windows', 'linux', 'macos', 'solaris'] UUID_PATTERN = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' VERSION_PATTERN = r'\d+\.\d+\.\d+' diff --git a/docs/ML_DGA.md b/docs/ML_DGA.md new file mode 100644 index 000000000..4f8555519 --- /dev/null +++ b/docs/ML_DGA.md @@ -0,0 +1,176 @@ +# Machine Learning on Domain Generation Algorithm (DGA) + +Several blogs were put out on how you can create and leverage supervised DGA ML models to enrich data within the stack. +* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity) +* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data) + +You can also find some supplementary and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection) + +For questions, please reach out to the ML team in the #machine-learning channel of the +[Elastic public slack channel](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack) + +They can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning) + +*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)* +*Note: the ML features are considered experimental in Kibana as well as this rules CLI* + +## Releases + +Models and dependencies will be [released](https://github.com/elastic/detection-rules/releases) as `ML-DGA-YYYMMDD-N`. +This tag name is what will need to be passed to the CLI command. + +## Uploading a model and dependencies using the CLI + +### Usage + +```console +python -m detection_rules es experimental setup-dga-model -h + +Elasticsearch client: +Options: + -u, --elasticsearch-url TEXT + --cloud-id TEXT + -u, --user TEXT + -p, --es-password TEXT + -t, --timeout INTEGER Timeout for elasticsearch client + +Usage: detection_rules es experimental setup-dga-model [OPTIONS] + + Upload DGA model and enrich DNS data. + +Options: + -t, --model-tag TEXT Release tag for model files staged in detection- + rules (required to download files) + -d, --model-dir DIRECTORY Directory containing local model files + --overwrite Overwrite all files if already in the stack + -h, --help Show this message and exit. +``` + +### Detailed steps + +#### 1. Upload and setup the model file and dependencies + +Run `python -m detection_rules es experimental setup-dga-model -t ` + +*If updating a new model, you should first uninstall any existing models using `remove-dga-model`* + +You can also upload files locally using the `-d` option, so long as the naming convention of the files match the +expected pattern for the filenames. + +#### 2. Update packetbeat configuration + +You will need to update your packebeat.yml config file to point to the enrichment pipeline + +Under `Elasticsearch Output` add the following: + +```yaml +output.elasticsearch: + hosts: ["your-hostname:your-port"] + pipeline: dns_enrich_pipeline +``` + +#### 3. Refresh your packetbeat index + +You can optionally choose to refresh your packetbeat index mapping within Kibana: +* navigate to `Stack Management > (Kibana) Index Patterns` +* select the applicable packetbeat index +* click `refresh field list` + +#### 4. Verify enrichment fields + +Any packetbeat documents with the field `dns.question.registered_domain` should now have the enriched data: +`ml_is_dga.*` + + +## Experimental DGA ML Jobs and Rules + +Once packetbeat data is being enriched, there are some rules and ML jobs which can leverage the enriched fields. +The experimental rules and jobs will be staged separate from the model bundle under the [releases](https://github.com/elastic/detection-rules/releases) +as `ML-experimental-detections-YYYMMDD-N`. + +Note that if a rule is of `type = "machine_learning"`, then it may be dependent on a uploading and running a machine +learning job first. If this is the case, it will likely be annotated within the `note` field of the rule. + +#### Uploading rules + +You can then individually upload these rules using the [kibana upload-rule](../CLI.md#uploading-rules-to-kibana) command + +#### Uploading ML Jobs + +Unzip released jobs and then run `python -m detection_rules es experimental upload-ml-job ` + +To delete a job, run `python -m detection_rules es experimental delete-ml-job ` + +Take note of any errors as some jobs may have dependencies on each other which may require stopping and or removing +referenced jobs first. + + +## For Maintainers + +### Validating release bundles and releasing + +Release assets are expected to be in certain formats with specific naming patterns and json structures. + +#### Filename patterns + +DGA model file naming convention should match the following patterns + +```json +{ + "model": "dga_*_model.json", + "dga_ngrams_create": "dga_*_ngrams_create.json", + "dga_ngrams_transform_delete": "dga_*_ngrams_transform_delete.json", + "dns_enrich_pipeline": "dga_*_ingest_pipeline1.json", + "dns_dga_inference_enrich_pipeline": "dga_*_ingest_pipeline2.json" +} +``` + +Experimental detections do not have to match a specific naming pattern but should be in the following file formats: +* rules: toml +* jobs: json + +#### Uniqueness + +The model file name and hash should be unique or else it will raise a warning in validation. This is important to allow +distinction and ascertain information about a bundle by consulting the manifest, based on a unique name + +Release zipped assets, name, and tag name all share the same name. These should follow the following format: +* Model releases: `ML-DGA-YYYYMMDD-1` +* Detection releases: `ML-experimental-detections-YYYYMMDD-1` + +The trailing digit should be incremented for each release + +Rule and Job names should also be unique + +#### Rule and job structure + +Rules files are only check if they are valid toml, nothing more. Consult existing production rules and schemas for API +expectations + +Job files are checked if they are valid toml and contain the following top level fields: +* name - job name +* type - job type +* body - the actual ML job data. The contents are not validated + +#### Validation + +All of these checks are automated and can be called with: +`python -m detection-rules dev gh-release validate-ml-dga-asset` - for model bundles +`python -m detection-rules dev gh-release validate-ml-detections-asset` for rule/job bundles + +Pay attention to the output to determine any necessary changes. This may not be all inclusive and actual testing on a +live stack should always occur even with passing validation before saving to a GitHub release + +#### Releasing + +Install dependencies with `pip install -r requirements-dev.txt` + +A release can be created via the cli using `python -m detection-rules dev gh-release create-ml` + +* you can only use a github token +* the base directory name and release name must match +* you must have write permissions to the repo to create a release +* validation also occurs on this, with a prompt to proceed +* upon completion, a manifest is uploaded as an asset to the GitHub release + +To test, you can fork the repo and use `--repo ` to validate a release is working as expected diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..635cde858 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +PyGithub==1.53