From 0ec8d67e78b069aa190297e8007072c2a2fc294e Mon Sep 17 00:00:00 2001 From: Justin Ibarra Date: Wed, 2 Jun 2021 20:37:12 -0800 Subject: [PATCH] Refactor experimental ML CLI and code (#1218) * move github and ml to their own files * refactor release and ml commands * update ML readmes * add unzip_to_dict function * prompt for model ID in remove-model * update experimental rule upload process * update remove-scripts-pipelines to take multiple options Co-authored-by: Ross Wolf <31489089+rw-access@users.noreply.github.com> Co-authored-by: Apoorva --- .gitignore | 2 + detection_rules/__init__.py | 4 + detection_rules/cli_utils.py | 3 +- detection_rules/devtools.py | 247 +--------- detection_rules/eswrap.py | 318 +----------- detection_rules/ghwrap.py | 295 ++++++++++++ detection_rules/misc.py | 299 +----------- detection_rules/ml.py | 452 ++++++++++++++++++ detection_rules/schemas/definitions.py | 5 + detection_rules/utils.py | 19 + docs/ML_DGA.md | 191 -------- docs/experimental-machine-learning/DGA.md | 54 +++ .../experimental-detections.md | 28 ++ .../problem-child.md | 64 +++ docs/experimental-machine-learning/readme.md | 102 ++++ 15 files changed, 1039 insertions(+), 1044 deletions(-) create mode 100644 detection_rules/ghwrap.py create mode 100644 detection_rules/ml.py delete mode 100644 docs/ML_DGA.md create mode 100644 docs/experimental-machine-learning/DGA.md create mode 100644 docs/experimental-machine-learning/experimental-detections.md create mode 100644 docs/experimental-machine-learning/problem-child.md create mode 100644 docs/experimental-machine-learning/readme.md diff --git a/.gitignore b/.gitignore index 36d4736ef..f0fc34cb8 100644 --- a/.gitignore +++ b/.gitignore @@ -107,9 +107,11 @@ ENV/ /_extras/ # detection rules +.detection-rules-cfg.json releases/ collections/ enriched-rule-indexes/ exports/ ML-models/ surveys/ +machine-learning/ diff --git a/detection_rules/__init__.py b/detection_rules/__init__.py index c867dec4e..84042de12 100644 --- a/detection_rules/__init__.py +++ b/detection_rules/__init__.py @@ -12,9 +12,11 @@ from . import ( # noqa: E402 devtools, docs, eswrap, + ghwrap, kbwrap, main, mappings, + ml, misc, rule_formatter, rule_loader, @@ -26,10 +28,12 @@ __all__ = ( 'devtools', 'docs', 'eswrap', + 'ghwrap', 'kbwrap', 'mappings', "main", 'misc', + 'ml', 'rule_formatter', 'rule_loader', 'schemas', diff --git a/detection_rules/cli_utils.py b/detection_rules/cli_utils.py index 7665a0a17..71702819e 100644 --- a/detection_rules/cli_utils.py +++ b/detection_rules/cli_utils.py @@ -67,14 +67,13 @@ def multi_collection(f): @click.option('--rule-id', '-id', multiple=True, required=False) @functools.wraps(f) def get_collection(*args, **kwargs): - rule_name: List[str] = kwargs.pop("rule_name", []) rule_id: List[str] = kwargs.pop("rule_id", []) rule_files: List[str] = kwargs.pop("rule_file") directories: List[str] = kwargs.pop("directory") rules = RuleCollection() - if not (rule_name or rule_id or rule_files): + if not (directories or rule_id or rule_files): client_error('Required: at least one of --rule-id, --rule-file, or --directory') rules.load_files(Path(p) for p in rule_files) diff --git a/detection_rules/devtools.py b/detection_rules/devtools.py index 77dff79ef..4ecd16257 100644 --- a/detection_rules/devtools.py +++ b/detection_rules/devtools.py @@ -6,7 +6,6 @@ """CLI commands for internal detection_rules dev team.""" import dataclasses import functools -import hashlib import io import json import os @@ -25,10 +24,11 @@ from kibana.connector import Kibana from . import rule_loader from .cli_utils import single_collection from .eswrap import CollectEvents, add_range_to_dsl +from .ghwrap import GithubClient from .main import root -from .misc import GithubClient, Manifest, PYTHON_LICENSE, add_client, client_error, getdefault +from .misc import PYTHON_LICENSE, add_client, client_error from .packaging import PACKAGE_FILE, Package, RELEASE_DIR, current_stack_version, manage_versions -from .rule import QueryRuleData, TOMLRule +from .rule import AnyRuleData, BaseRuleData, QueryRuleData, TOMLRule from .rule_loader import RuleCollection, production_filter from .utils import dict_hash, get_path, load_dump @@ -417,6 +417,14 @@ def deprecate_rule(ctx: click.Context, rule_file: str): click.echo(f'Rule moved to {deprecated_path} - remember to git add this file') +@dev_group.command("update-schemas") +def update_schemas(): + classes = [BaseRuleData] + list(typing.get_args(AnyRuleData)) + + for cls in classes: + cls.save_schema() + + @dev_group.group('test') def test_group(): """Commands for testing against stack resources.""" @@ -557,236 +565,3 @@ def rule_survey(ctx: click.Context, query, date_range, dump_file, hide_zero_coun json.dump(details, f, indent=2, sort_keys=True) return survey_results - - -@dev_group.group('gh-release') -def gh_release_group(): - """Commands to manage GitHub releases.""" - - -@gh_release_group.command('create-ml') -@click.argument('directory', type=click.Path(dir_okay=True, file_okay=False)) -@click.option('--gh-token', '-t', default=getdefault('gh_token')) -@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo') -@click.option('--release-name', '-n', required=True, help='Name of release') -@click.option('--description', '-d', help='Description of release to append to default message') -@click.pass_context -def create_ml_release(ctx, directory, gh_token, repo, release_name, description): - """Create a GitHub release.""" - import re - - # ML-DGA-20201129-25 - pattern = r'^(ML-DGA|ML-experimental-detections)-\d{4}\d{2}\d{2}-\d+$' - assert re.match(pattern, release_name), f'release name must match pattern: {pattern}' - assert Path(directory).name == release_name, f'directory name must match release name: {release_name}' - - gh_token = gh_token or click.prompt('GitHub token', hide_input=True) - client = GithubClient(gh_token) - gh_repo = client.authenticated_client.get_repo(repo) - - # validate tag name is increment by 1 - name_prefix, _, version = release_name.rsplit('-', 2) - version = int(version) - releases = gh_repo.get_releases() - max_ver = max([int(r.raw_data['name'].split('-')[-1]) for r in releases - if r.raw_data['name'].startswith(name_prefix)], default=0) - - if version != (max_ver + 1): - client_error(f'Last release version was {max_ver}. Release name should end with version: {max_ver + 1}') - - # validate files - if name_prefix == 'ML-DGA': - zipped_bundle, description_str = ctx.invoke(validate_ml_dga_asset, directory=directory, repo=repo) - else: - zipped_bundle, description_str = ctx.invoke(validate_ml_detections_asset, directory=directory) - - click.confirm('Validation passed, verify output. Continue?') - - if description: - description_str = f'{description_str}\n\n----\n\n{description}' - - release = gh_repo.create_git_release(name=release_name, tag=release_name, message=description_str) - zip_name = Path(zipped_bundle).name - - click.echo(f'release created at: {release.html_url}') - - # add zipped bundle as an asset to the release - click.echo(f'Uploading zip file: {zip_name}') - release.upload_asset(zipped_bundle, label=zip_name, name=zip_name, content_type='application/zip') - - # create manifest entry - click.echo('creating manifest for release') - manifest = Manifest(repo, tag_name=release_name, token=gh_token) - manifest.save() - - return release - - -@gh_release_group.command('validate-ml-dga-asset') -@click.argument('directory', type=click.Path(exists=True, file_okay=False)) -@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo') -def validate_ml_dga_asset(directory, repo): - """"Validate and prep an ML DGA bundle for release.""" - from .eswrap import expected_ml_dga_patterns - - now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) - - files = list(Path(directory).glob('*')) - if len(files) > 5: - client_error('Too many files, expected 5') - - click.secho('[*] validated expected number of files', fg='green') - - # backup files - will re-save sorted to have deterministic hash - backup_path = Path(directory).resolve().parent.joinpath(f'backups-{Path(directory).name}-{now.replace(":", "-")}') - shutil.copytree(directory, backup_path) - - # validate file names and json and load - loaded_contents = {} - for name, pattern in expected_ml_dga_patterns.items(): - path = list(Path(directory).glob(pattern)) - match_count = len(path) - if match_count != 1: - client_error(f'Expected filename pattern "{pattern}" for "{name}": {match_count} matches detected') - - file_path = path[0] - try: - with open(file_path, 'r') as f: - contents = json.dumps(json.load(f), sort_keys=True) - loaded_contents[name] = {'contents': contents, 'filename': file_path} - - sha256 = hashlib.sha256(contents.encode('utf-8')).hexdigest() - click.secho(f' - sha256: {sha256} - {name}') - - # re-save sorted - with open(file_path, 'w') as f: - f.write(contents) - except json.JSONDecodeError as e: - client_error(f'Invalid JSON in {file_path} file', e) - - model_filename = Path(loaded_contents['model']['filename']).name - model_name, _ = model_filename.rsplit('_', maxsplit=1) - - click.secho('[*] re-saved all files with keys sorted for deterministic hashing', fg='green') - click.secho(f' [+] backups saved to: {backup_path}') - click.secho('[*] validated expected naming patterns for all files', fg='green') - click.secho('[*] validated json formatting of all files', fg='green') - - # check manifest for existing things - existing_sha = False - existing_model_name = False - model_hash = hashlib.sha256(loaded_contents['model']['contents'].encode('utf-8')).hexdigest() - manifest_hashes = Manifest.get_existing_asset_hashes(repo) - - for release, file_data in manifest_hashes.items(): - for file_name, sha in file_data.items(): - if model_hash == sha: - existing_sha = True - click.secho(f'[!] hash for model file: "{loaded_contents["model"]["filename"]}" matches: ' - f'{release} -> {file_name} -> {sha}', fg='yellow') - - if model_filename == file_name: - existing_model_name = True - client_error(f'name for model file: "{loaded_contents["model"]["filename"]}" matches: ' - f'{release} -> {file_name} -> {file_name}') - - if not existing_sha: - click.secho(f'[+] validated no existing models matched hashes for: ' - f'{loaded_contents["model"]["filename"]}', fg='green') - - if not existing_model_name: - click.secho(f'[+] validated no existing models matched names for: ' - f'{loaded_contents["model"]["filename"]}', fg='green') - - # save zip - zip_name_no_ext = Path(directory).resolve() - zip_name = f'{zip_name_no_ext}.zip' - shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name) - click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green') - - click.secho(f'[!] run `setup-dga-model -d {directory}` to test this on a live stack before releasing', fg='yellow') - - description = { - 'model_name': model_name + '\n\n----\n\n', - 'date': now, - 'model_sha256': model_hash, - 'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md' - } - description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()]) - click.echo() - click.echo(f'[*] description to paste with release:\n\n{description_str}\n') - - return zip_name, description_str - - -@gh_release_group.command('validate-ml-detections-asset') -@click.argument('directory', type=click.Path(exists=True, file_okay=False)) -def validate_ml_detections_asset(directory): - """Validate and prep ML detection rules and jobs before release.""" - import pytoml - - now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) - - all_files = list(Path(directory).glob('*')) - job_paths = [f for f in all_files if f.suffix == '.json'] - rule_paths = [f for f in all_files if f.suffix == '.toml'] - other_paths = [f for f in Path(directory).glob('*') if f.suffix not in ('.toml', '.json')] - job_count = len(job_paths) - rule_count = len(rule_paths) - other_count = len(other_paths) - - if 'readme.md' not in [f.name.lower() for f in other_paths]: - client_error('Release is missing readme file') - - for job in job_paths: - try: - with open(job, 'r') as f: - j = json.load(f) - assert j.get('name'), click.style(f'[!] job file "{job}" missing: name', fg='red') - assert j.get('type'), click.style(f'[!] job file "{job}" missing: type', fg='red') - assert j.get('body'), click.style(f'[!] job file "{job}" missing: body', fg='red') - except json.JSONDecodeError as e: - client_error(f'Invalid JSON in {job} file', e) - - click.secho(f'[*] validated json formatting and required fields in {job_count} job files', fg='green') - - for rule in rule_paths: - with open(rule, 'r') as f: - try: - pytoml.load(f) - except pytoml.TomlError as e: - client_error(f'[!] invalid rule toml for: {rule}', e) - - click.secho(f'[*] validated toml formatting for {rule_count} rule files', fg='green') - - # save zip - zip_name_no_ext = Path(directory).resolve() - zip_name = f'{zip_name_no_ext}.zip' - shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name) - click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green') - - click.secho('[!] run `kibana upload-rule` to test rules on a live stack before releasing', fg='green') - click.secho('[!] run `es upload-ml-job` to test jobs on a live stack before releasing', fg='green') - - description = { - 'Experimental rules': rule_count, - 'Experimental ML jobs': job_count, - 'Other files': str(other_count) + '\n\n----\n\n', - 'DGA release': '', - 'date': now, - 'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md' - } - description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()]) - click.echo() - click.echo(f'description to paste with release:\n\n{description_str}\n') - - return zip_name, description_str - - -@dev_group.command("update-schemas") -def update_schemas(): - from .rule import BaseRuleData, AnyRuleData - classes = [BaseRuleData] + list(typing.get_args(AnyRuleData)) - - for cls in classes: - cls.save_schema() diff --git a/detection_rules/eswrap.py b/detection_rules/eswrap.py index c9a019d1d..b6e610f01 100644 --- a/detection_rules/eswrap.py +++ b/detection_rules/eswrap.py @@ -8,14 +8,12 @@ import json import os import time from collections import defaultdict -from contextlib import contextmanager -from pathlib import Path from typing import Union import click import elasticsearch from elasticsearch import Elasticsearch -from elasticsearch.client import AsyncSearchClient, IngestClient, LicenseClient, MlClient +from elasticsearch.client import AsyncSearchClient import kql from .main import root @@ -422,317 +420,3 @@ def index_repo(ctx: click.Context, query, from_file, save_files): def es_experimental(): """[Experimental] helper commands for integrating with Elasticsearch.""" click.secho('\n* experimental commands are use at your own risk and may change without warning *\n') - - -@es_experimental.command('check-model-files') -@click.pass_context -def check_model_files(ctx): - """Check ML model files on an elasticsearch instance.""" - from elasticsearch.client import IngestClient, MlClient - from .misc import get_ml_model_manifests_by_model_id - - es_client: Elasticsearch = ctx.obj['es'] - ml_client = MlClient(es_client) - ingest_client = IngestClient(es_client) - - def safe_get(func, arg): - try: - return func(arg) - except elasticsearch.NotFoundError: - return None - - models = [m for m in ml_client.get_trained_models().get('trained_model_configs', []) - if m['created_by'] != '_xpack'] - - if models: - if len([m for m in models if m['model_id'].startswith('dga_')]) > 1: - click.secho('Multiple DGA models detected! It is not recommended to run more than one DGA model at a time', - fg='yellow') - - manifests = get_ml_model_manifests_by_model_id() - - click.echo(f'DGA Model{"s" if len(models) > 1 else ""} found:') - for model in models: - manifest = manifests.get(model['model_id']) - click.echo(f' - {model["model_id"]}, associated release: {manifest.html_url if manifest else None}') - else: - click.echo('No DGA Models found') - - support_files = { - 'create_script': safe_get(es_client.get_script, 'dga_ngrams_create'), - 'delete_script': safe_get(es_client.get_script, 'dga_ngrams_transform_delete'), - 'enrich_pipeline': safe_get(ingest_client.get_pipeline, 'dns_enrich_pipeline'), - 'inference_pipeline': safe_get(ingest_client.get_pipeline, 'dns_dga_inference_enrich_pipeline') - } - - click.echo('Support Files:') - for support_file, results in support_files.items(): - click.echo(f' - {support_file}: {"found" if results else "not found"}') - - -@es_experimental.command('remove-dga-model') -@click.argument('model-id') -@click.option('--force', '-f', is_flag=True, help='Force the attempted delete without checking if model exists') -@click.pass_context -def remove_dga_model(ctx, model_id, force, es_client: Elasticsearch = None, ml_client: MlClient = None, - ingest_client: IngestClient = None): - """Remove ML DGA files.""" - from elasticsearch.client import IngestClient, MlClient - - es_client = es_client or ctx.obj['es'] - ml_client = ml_client or MlClient(es_client) - ingest_client = ingest_client or IngestClient(es_client) - - def safe_delete(func, fid, verbose=True): - try: - func(fid) - except elasticsearch.NotFoundError: - return False - if verbose: - click.echo(f' - {fid} deleted') - return True - - model_exists = False - if not force: - existing_models = ml_client.get_trained_models() - model_exists = model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])] - - if model_exists or force: - if model_exists: - click.secho('[-] Existing model detected - deleting files', fg='yellow') - - deleted = [ - safe_delete(ingest_client.delete_pipeline, 'dns_dga_inference_enrich_pipeline'), - safe_delete(ingest_client.delete_pipeline, 'dns_enrich_pipeline'), - safe_delete(es_client.delete_script, 'dga_ngrams_transform_delete'), - # f'{model_id}_dga_ngrams_transform_delete' - safe_delete(es_client.delete_script, 'dga_ngrams_create'), - # f'{model_id}_dga_ngrams_create' - safe_delete(ml_client.delete_trained_model, model_id) - ] - - if not any(deleted): - click.echo('No files deleted') - else: - click.echo(f'Model: {model_id} not found') - - -expected_ml_dga_patterns = { - 'model': 'dga_*_model.json', # noqa: E241 - 'dga_ngrams_create': 'dga_*_ngrams_create.json', # noqa: E241 - 'dga_ngrams_transform_delete': 'dga_*_ngrams_transform_delete.json', # noqa: E241 - 'dns_enrich_pipeline': 'dga_*_ingest_pipeline1.json', # noqa: E241 - 'dns_dga_inference_enrich_pipeline': 'dga_*_ingest_pipeline2.json' # noqa: E241 -} - - -@es_experimental.command('setup-dga-model') -@click.option('--model-tag', '-t', - help='Release tag for model files staged in detection-rules (required to download files)') -@click.option('--repo', '-r', default='elastic/detection-rules', - help='GitHub repository hosting the model file releases (owner/repo)') -@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False), - help='Directory containing local model files') -@click.option('--overwrite', is_flag=True, help='Overwrite all files if already in the stack') -@click.pass_context -def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite): - """Upload ML DGA model and dependencies and enrich DNS data.""" - import io - import requests - import shutil - import zipfile - - es_client: Elasticsearch = ctx.obj['es'] - client_info = es_client.info() - license_client = LicenseClient(es_client) - - if license_client.get()['license']['type'].lower() not in ('platinum', 'enterprise'): - client_error('You must have a platinum or enterprise subscription in order to use these ML features') - - # download files if necessary - if not model_dir: - if not model_tag: - client_error('model-tag or model-dir required to download model files') - - click.echo(f'Downloading artifact: {model_tag}') - - release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}' - release = requests.get(release_url) - release.raise_for_status() - assets = [a for a in release.json()['assets'] if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')] - - if len(assets) != 1: - client_error(f'Malformed release: expected 1 match ML-DGA zip, found: {len(assets)}!') - - zipped_url = assets[0]['browser_download_url'] - zipped = requests.get(zipped_url) - z = zipfile.ZipFile(io.BytesIO(zipped.content)) - - dga_dir = get_path('ML-models', 'DGA') - model_dir = os.path.join(dga_dir, model_tag) - os.makedirs(dga_dir, exist_ok=True) - shutil.rmtree(model_dir, ignore_errors=True) - z.extractall(dga_dir) - click.echo(f'files saved to {model_dir}') - - # read files as needed - z.close() - - def get_model_filename(pattern): - paths = list(Path(model_dir).glob(pattern)) - if not paths: - client_error(f'{model_dir} missing files matching the pattern: {pattern}') - if len(paths) > 1: - client_error(f'{model_dir} contains multiple files matching the pattern: {pattern}') - - return paths[0] - - @contextmanager - def open_model_file(name): - pattern = expected_ml_dga_patterns[name] - with open(get_model_filename(pattern), 'r') as f: - yield json.load(f) - - model_id, _ = os.path.basename(get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1) - - click.echo(f'Setting up DGA model: "{model_id}" on {client_info["name"]} ({client_info["version"]["number"]})') - - # upload model - ml_client = MlClient(es_client) - ingest_client = IngestClient(es_client) - - existing_models = ml_client.get_trained_models() - if model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]: - if overwrite: - ctx.invoke(remove_dga_model, model_id=model_id, es_client=es_client, ml_client=ml_client, - ingest_client=ingest_client, force=True) - else: - client_error(f'Model: {model_id} already exists on stack! Try --overwrite to force the upload') - - click.secho('[+] Uploading model (may take a while)') - - with open_model_file('model') as model_file: - try: - ml_client.put_trained_model(model_id=model_id, body=model_file) - except elasticsearch.ConnectionTimeout: - msg = 'Connection timeout, try increasing timeout using `es --timeout experimental setup_dga_model`.' - client_error(msg) - - # install scripts - click.secho('[+] Uploading painless scripts') - - with open_model_file('dga_ngrams_create') as painless_install: - es_client.put_script(id='dga_ngrams_create', body=painless_install) - # f'{model_id}_dga_ngrams_create' - - with open_model_file('dga_ngrams_transform_delete') as painless_delete: - es_client.put_script(id='dga_ngrams_transform_delete', body=painless_delete) - # f'{model_id}_dga_ngrams_transform_delete' - - # Install ingest pipelines - click.secho('[+] Uploading pipelines') - - def _build_es_script_error(err, pipeline_file): - error = err.info['error'] - cause = error['caused_by'] - - error_msg = [ - f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}', - ' '.join(f'{k}: {v}' for k, v in error['position'].items()), - '\n'.join(error['script_stack']) - ] - - return click.style('\n'.join(error_msg), fg='red') - - with open_model_file('dns_enrich_pipeline') as ingest_pipeline1: - try: - ingest_client.put_pipeline(id='dns_enrich_pipeline', body=ingest_pipeline1) - except elasticsearch.RequestError as e: - if e.error == 'script_exception': - client_error(_build_es_script_error(e, 'ingest_pipeline1'), e, ctx=ctx) - else: - raise - - with open_model_file('dns_dga_inference_enrich_pipeline') as ingest_pipeline2: - try: - ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline', body=ingest_pipeline2) - except elasticsearch.RequestError as e: - if e.error == 'script_exception': - client_error(_build_es_script_error(e, 'ingest_pipeline2'), e, ctx=ctx) - else: - raise - - click.echo('Ensure that you have updated your packetbeat.yml config file.') - click.echo(' - reference: ML_DGA.md #2-update-packetbeat-configuration') - click.echo('Associated rules and jobs can be found under ML-experimental-detections releases in the repo') - click.echo('To upload rules, run: kibana upload-rule ') - click.echo('To upload ML jobs, run: es experimental upload-ml-job ') - - -@es_experimental.command('upload-ml-job') -@click.argument('job-file', type=click.Path(exists=True, dir_okay=False)) -@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name') -@click.pass_context -def upload_ml_job(ctx: click.Context, job_file, overwrite): - """Upload experimental ML jobs.""" - es_client: Elasticsearch = ctx.obj['es'] - ml_client = MlClient(es_client) - - with open(job_file, 'r') as f: - job = json.load(f) - - def safe_upload(func): - try: - func(name, body) - except (elasticsearch.ConflictError, elasticsearch.RequestError) as err: - if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception': - client_error(str(err), err, ctx=ctx) - - if overwrite: - ctx.invoke(delete_ml_job, job_name=name, job_type=job_type) - func(name, body) - else: - client_error(str(err), err, ctx=ctx) - - try: - job_type = job['type'] - name = job['name'] - body = job['body'] - - if job_type == 'anomaly_detection': - safe_upload(ml_client.put_job) - elif job_type == 'data_frame_analytic': - safe_upload(ml_client.put_data_frame_analytics) - elif job_type == 'datafeed': - safe_upload(ml_client.put_datafeed) - else: - client_error(f'Unknown ML job type: {job_type}') - - click.echo(f'Uploaded {job_type} job: {name}') - except KeyError as e: - client_error(f'{job_file} missing required info: {e}') - - -@es_experimental.command('delete-ml-job') -@click.argument('job-name') -@click.argument('job-type') -@click.pass_context -def delete_ml_job(ctx: click.Context, job_name, job_type, verbose=True): - """Remove experimental ML jobs.""" - es_client: Elasticsearch = ctx.obj['es'] - ml_client = MlClient(es_client) - - try: - if job_type == 'anomaly_detection': - ml_client.delete_job(job_name) - elif job_type == 'data_frame_analytic': - ml_client.delete_data_frame_analytics(job_name) - elif job_type == 'datafeed': - ml_client.delete_datafeed(job_name) - else: - client_error(f'Unknown ML job type: {job_type}') - except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e: - client_error(str(e), e, ctx=ctx) - - if verbose: - click.echo(f'Deleted {job_type} job: {job_name}') diff --git a/detection_rules/ghwrap.py b/detection_rules/ghwrap.py new file mode 100644 index 000000000..a9aa4f60e --- /dev/null +++ b/detection_rules/ghwrap.py @@ -0,0 +1,295 @@ +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0; you may not use this file except in compliance with the Elastic License +# 2.0. + +"""Schemas and dataclasses for GitHub releases.""" + +import dataclasses +import hashlib +import io +import json +import shutil +import time +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Dict, Optional, Tuple +from zipfile import ZipFile + +import click +import requests + +from .schemas import definitions + +# this is primarily for type hinting - all use of the github client should come from GithubClient class +try: + from github import Github + from github.Repository import Repository + from github.GitRelease import GitRelease + from github.GitReleaseAsset import GitReleaseAsset +except ImportError: + # for type hinting + Github = None # noqa: N806 + Repository = None # noqa: N806 + GitRelease = None # noqa: N806 + GitReleaseAsset = None # noqa: N806 + + +def get_gh_release(repo: Repository, release_name: Optional[str] = None, tag_name: Optional[str] = None) -> GitRelease: + """Get a list of GitHub releases by repo.""" + assert release_name or tag_name, 'Must specify a release_name or tag_name' + + releases = repo.get_releases() + for release in releases: + if release_name and release_name == release.title: + return release + elif tag_name and tag_name == release.tag_name: + return release + + +def load_zipped_gh_assets_with_metadata(url: str) -> Tuple[str, dict]: + """Download and unzip a GitHub assets.""" + response = requests.get(url) + zipped_asset = ZipFile(io.BytesIO(response.content)) + zipped_sha256 = hashlib.sha256(response.content).hexdigest() + + assets = {} + for zipped in zipped_asset.filelist: + if zipped.is_dir(): + continue + + contents = zipped_asset.read(zipped.filename) + sha256 = hashlib.sha256(contents).hexdigest() + + assets[zipped.filename] = { + 'contents': contents, + 'metadata': { + 'compress_size': zipped.compress_size, + # zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday + 'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)), + 'sha256': sha256, + 'size': zipped.file_size, + } + } + + return zipped_sha256, assets + + +def load_json_gh_asset(url: str) -> dict: + """Load and return the contents of a json asset file.""" + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def download_gh_asset(url: str, path: str, overwrite=False): + """Download and unzip a GitHub asset.""" + zipped = requests.get(url) + z = ZipFile(io.BytesIO(zipped.content)) + + Path(path).mkdir(exist_ok=True) + if overwrite: + shutil.rmtree(path, ignore_errors=True) + + z.extractall(path) + click.echo(f'files saved to {path}') + + z.close() + + +class GithubClient: + """GitHub client wrapper.""" + + def __init__(self, token: Optional[str] = None): + """Get an unauthenticated client, verified authenticated client, or a default client.""" + if not Github: + raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`') + + self.client: Github = Github(token) + self.unauthenticated_client = Github() + self.__token = token + self.__authenticated_client = None + + @property + def authenticated_client(self) -> Github: + if not self.__token: + raise ValueError('Token not defined! Re-instantiate with a token or use add_token method') + if not self.__authenticated_client: + self.__authenticated_client = Github(self.__token) + return self.__authenticated_client + + def add_token(self, token): + self.__token = token + + +@dataclass +class AssetManifestEntry: + + compress_size: int + created_at: datetime + name: str + sha256: str + size: int + + +@dataclass +class AssetManifestMetadata: + + relative_url: str + entries: Dict[str, AssetManifestEntry] + zipped_sha256: definitions.Sha256 + created_at: datetime = field(default_factory=datetime.utcnow) + description: Optional[str] = None # populated by GitHub release asset label + + +@dataclass +class ReleaseManifest: + + assets: Dict[str, AssetManifestMetadata] + assets_url: str + author: str # parsed from GitHub release metadata as: author[login] + created_at: str + html_url: str + id: int + name: str + published_at: str + url: str + zipball_url: str + tag_name: str = None + description: str = None # parsed from GitHub release metadata as: body + + +class ManifestManager: + """Manifest handler for GitHub releases.""" + + def __init__(self, repo: str = 'elastic/detection-rules', release_name: Optional[str] = None, + tag_name: Optional[str] = None, token: Optional[str] = None): + self.repo_name = repo + self.release_name = release_name + self.tag_name = tag_name + self.gh_client = GithubClient(token) + self.has_token = token is not None + + self.repo: Repository = self.gh_client.client.get_repo(repo) + self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name) + + if not self.release: + raise ValueError(f'No release found for {tag_name or release_name}') + + if not self.release_name: + self.release_name = self.release.title + + self.manifest_name = f'manifest-{self.release_name}.json' + self.assets: dict = self._get_enriched_assets_from_release() + self.release_manifest = self._create() + self.__release_manifest_dict = dataclasses.asdict(self.release_manifest) + self.manifest_size = len(json.dumps(self.__release_manifest_dict)) + + @property + def release_manifest_fl(self) -> io.BytesIO: + return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8')) + + def _create(self) -> ReleaseManifest: + """Create the manifest from GitHub asset metadata and file contents.""" + assets = {} + for asset_name, asset_data in self.assets.items(): + entries = {} + data = asset_data['data'] + metadata = asset_data['metadata'] + + for file_name, file_data in data.items(): + file_metadata = file_data['metadata'] + + name = Path(file_name).name + file_metadata.update(name=name) + + entry = AssetManifestEntry(**file_metadata) + entries[name] = entry + + assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries, + metadata['zipped_sha256'], metadata['created_at'], + metadata['label']) + + release_metadata = self._parse_release_metadata() + release_metadata.update(assets=assets) + release_manifest = ReleaseManifest(**release_metadata) + + return release_manifest + + def _parse_release_metadata(self) -> dict: + """Parse relevant info from GitHub metadata for release manifest.""" + ignore = ['assets'] + manual_set_keys = ['author', 'description'] + keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys] + parsed = {k: self.release.raw_data[k] for k in keys} + parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login']) + return parsed + + def save(self) -> GitReleaseAsset: + """Save manifest files.""" + if not self.has_token: + raise ValueError('You must provide a token to save a manifest to a GitHub release') + + asset = self.release.upload_asset_from_memory(self.release_manifest_fl, + self.manifest_size, + self.manifest_name) + click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}') + return asset + + @classmethod + def load(cls, name: str, repo: str = 'elastic/detection-rules', token: Optional[str] = None) -> Optional[dict]: + """Load a manifest.""" + gh_client = GithubClient(token) + repo = gh_client.client.get_repo(repo) + release = get_gh_release(repo, tag_name=name) + + for asset in release.get_assets(): + if asset.name == f'manifest-{name}.json': + return load_json_gh_asset(asset.browser_download_url) + + @classmethod + def load_all(cls, repo: str = 'elastic/detection-rules', token: Optional[str] = None + ) -> Tuple[Dict[str, dict], list]: + """Load a consolidated manifest.""" + gh_client = GithubClient(token) + repo = gh_client.client.get_repo(repo) + + consolidated = {} + missing = set() + for release in repo.get_releases(): + name = release.tag_name + asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None) + if not asset: + missing.add(name) + else: + consolidated[name] = load_json_gh_asset(asset.browser_download_url) + + return consolidated, list(missing) + + @classmethod + def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token: Optional[str] = None) -> dict: + """Load all assets with their hashes, by release.""" + flat = {} + consolidated, _ = cls.load_all(repo=repo, token=token) + for release, data in consolidated.items(): + for asset in data['assets'].values(): + flat_release = flat[release] = {} + for asset_name, asset_data in asset['entries'].items(): + flat_release[asset_name] = asset_data['sha256'] + + return flat + + def _get_enriched_assets_from_release(self) -> dict: + """Get assets and metadata from a GitHub release.""" + assets = {} + for asset in [a.raw_data for a in self.release.get_assets()]: + zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url']) + asset.update(zipped_sha256=zipped_sha256) + + assets[asset['name']] = { + 'metadata': asset, + 'data': data + } + + return assets diff --git a/detection_rules/misc.py b/detection_rules/misc.py index c5c86bd1c..2da9df46c 100644 --- a/detection_rules/misc.py +++ b/detection_rules/misc.py @@ -4,22 +4,14 @@ # 2.0. """Misc support.""" -import hashlib -import io import json import os import re -import shutil import time import uuid -import dataclasses -from dataclasses import dataclass, field -from datetime import datetime from functools import wraps -from pathlib import Path -from typing import Dict, NoReturn, Tuple -from zipfile import ZipFile +from typing import NoReturn import click import requests @@ -57,295 +49,6 @@ JS_LICENSE = """ """.strip().format("\n".join(' * ' + line for line in LICENSE_LINES)) -def get_gh_release(repo: Repository, release_name=None, tag_name=None) -> GitRelease: - """Get a list of GitHub releases by repo.""" - assert release_name or tag_name, 'Must specify a release_name or tag_name' - - releases = repo.get_releases() - release = next((r for r in releases - if (release_name and release_name == r.title) or (tag_name and tag_name == r.tag_name)), None) - - return release - - -def upload_gh_release_asset(token, asset: bytes, asset_name, repo=None, content_type='application/zip', - release_name=None, tag_name=None, upload_url=None): - """Save a Github relase asset.""" - if not upload_url: - assert repo, 'You must provide a repo name if not providing an upload_url' - - release = get_gh_release(repo, release_name, tag_name) - upload_url = release['upload_url'] - suffix = '{?name,label}' - upload_url = upload_url.replace(suffix, f'?name={asset_name}&label={asset_name}') - - headers = {'content-type': content_type} - r = requests.post(upload_url, auth=('', token), data=asset, headers=headers) - r.raise_for_status() - click.echo(f'Uploaded {asset_name} to release: {r.json()["url"]}') - - -def load_zipped_gh_assets_with_metadata(url) -> Tuple[str, dict]: - """Download and unzip a GitHub assets.""" - response = requests.get(url) - zipped_asset = ZipFile(io.BytesIO(response.content)) - zipped_sha256 = hashlib.sha256(response.content).hexdigest() - - assets = {} - for zipped in zipped_asset.filelist: - if zipped.is_dir(): - continue - - contents = zipped_asset.read(zipped.filename) - sha256 = hashlib.sha256(contents).hexdigest() - - assets[zipped.filename] = { - 'contents': contents, - 'metadata': { - 'compress_size': zipped.compress_size, - # zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday - 'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)), - 'sha256': sha256, - 'size': zipped.file_size, - } - } - - return zipped_sha256, assets - - -def load_json_gh_asset(url) -> dict: - """Load and return the contents of a json asset file.""" - response = requests.get(url) - response.raise_for_status() - return response.json() - - -def download_gh_asset(url, path, overwrite=False): - """Download and unzip a GitHub asset.""" - zipped = requests.get(url) - z = ZipFile(io.BytesIO(zipped.content)) - - Path(path).mkdir(exist_ok=True) - if overwrite: - shutil.rmtree(path, ignore_errors=True) - - z.extractall(path) - click.echo(f'files saved to {path}') - - z.close() - - -class GithubClient: - """GitHub client wrapper.""" - - def __init__(self, token=None): - """Get an unauthenticated client, verified authenticated client, or a default client.""" - if not Github: - raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`') - - self.client: Github = Github(token) - self.unauthenticated_client = Github() - self.__token = token - self.__authenticated_client = None - - @property - def authenticated_client(self) -> Github: - if not self.__token: - raise ValueError('Token not defined! Re-instantiate with a token or use add_token method') - if not self.__authenticated_client: - self.__authenticated_client = Github(self.__token) - return self.__authenticated_client - - def add_token(self, token): - self.__token = token - - -@dataclass -class AssetManifestEntry: - - compress_size: int - created_at: datetime - name: str - sha256: str - size: int - - -@dataclass -class AssetManifestMetadata: - - relative_url: str - entries: Dict[str, AssetManifestEntry] - zipped_sha256: str - created_at: datetime = field(default_factory=datetime.utcnow) - description: str = None # label - - -@dataclass -class ReleaseManifest: - - assets: Dict[str, AssetManifestMetadata] - assets_url: str - author: str # parsed from GitHub release metadata as: author[login] - created_at: str - html_url: str - id: int - name: str - published_at: str - url: str - zipball_url: str - tag_name: str = None - description: str = None # body - - -class Manifest: - """Manifest handler for GitHub releases.""" - - def __init__(self, repo: str = 'elastic/detection-rules', release_name=None, tag_name=None, token=None): - self.repo_name = repo - self.release_name = release_name - self.tag_name = tag_name - self.gh_client = GithubClient(token) - self.has_token = token is not None - - self.repo: Repository = self.gh_client.client.get_repo(repo) - self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name) - - if not self.release: - raise ValueError(f'No release found for {tag_name or release_name}') - - if not self.release_name: - self.release_name = self.release.title - - self.manifest_name = f'manifest-{self.release_name}.json' - self.assets: dict = self._get_enriched_assets_from_release() - self.release_manifest = self._create() - self.__release_manifest_dict = dataclasses.asdict(self.release_manifest) - self.manifest_size = len(json.dumps(self.__release_manifest_dict)) - - @property - def release_manifest_fl(self): - return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8')) - - def _create(self): - """Create the manifest from GitHub asset metadata and file contents.""" - assets = {} - for asset_name, asset_data in self.assets.items(): - entries = {} - data = asset_data['data'] - metadata = asset_data['metadata'] - - for file_name, file_data in data.items(): - file_metadata = file_data['metadata'] - - name = Path(file_name).name - file_metadata.update(name=name) - - entry = AssetManifestEntry(**file_metadata) - entries[name] = entry - - assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries, - metadata['zipped_sha256'], metadata['created_at'], - metadata['label']) - - release_metadata = self._parse_release_metadata() - release_metadata.update(assets=assets) - release_manifest = ReleaseManifest(**release_metadata) - - return release_manifest - - def _parse_release_metadata(self): - """Parse relevant info from GitHub metadata for release manifest.""" - ignore = ['assets'] - manual_set_keys = ['author', 'description'] - keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys] - parsed = {k: self.release.raw_data[k] for k in keys} - parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login']) - return parsed - - def save(self) -> GitReleaseAsset: - """Save manifest files.""" - if not self.has_token: - raise ValueError('You must provide a token to save a manifest to a GitHub release') - - asset = self.release.upload_asset_from_memory(self.release_manifest_fl, - self.manifest_size, - self.manifest_name) - click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}') - return asset - - @classmethod - def load(cls, name: str, repo: str = 'elastic/detection-rules', token=None) -> dict: - """Load a manifest.""" - gh_client = GithubClient(token) - repo = gh_client.client.get_repo(repo) - release = get_gh_release(repo, tag_name=name) - asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None) - - if asset is not None: - return load_json_gh_asset(asset.browser_download_url) - - @classmethod - def load_all(cls, repo: str = 'elastic/detection-rules', token=None) -> Tuple[Dict[str, dict], list]: - """Load a consolidated manifest.""" - gh_client = GithubClient(token) - repo = gh_client.client.get_repo(repo) - - consolidated = {} - missing = set() - for release in repo.get_releases(): - name = release.tag_name - asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None) - if not asset: - missing.add(name) - else: - consolidated[name] = load_json_gh_asset(asset.browser_download_url) - - return consolidated, list(missing) - - @classmethod - def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token=None) -> dict: - """Load all assets with their hashes, by release.""" - flat = {} - consolidated, _ = cls.load_all(repo=repo, token=token) - for release, data in consolidated.items(): - for asset in data['assets'].values(): - flat_release = flat[release] = {} - for asset_name, asset_data in asset['entries'].items(): - flat_release[asset_name] = asset_data['sha256'] - - return flat - - def _get_enriched_assets_from_release(self): - """Get assets and metadata from a GitHub release.""" - assets = {} - for asset in [a.raw_data for a in self.release.get_assets()]: - zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url']) - asset.update(zipped_sha256=zipped_sha256) - - assets[asset['name']] = { - 'metadata': asset, - 'data': data - } - - return assets - - -def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]: - """Load all ML DGA model release manifests by model id.""" - manifests, _ = Manifest.load_all(repo=repo) - model_manifests = {} - - for manifest_name, manifest in manifests.items(): - for asset_name, asset in manifest['assets'].items(): - for entry_name, entry_data in asset['entries'].items(): - if entry_name.startswith('dga') and entry_name.endswith('model.json'): - model_id, _ = entry_name.rsplit('_', 1) - model_manifests[model_id] = ReleaseManifest(**manifest) - break - - return model_manifests - - class ClientError(click.ClickException): """Custom CLI error to format output or full debug stacktrace.""" diff --git a/detection_rules/ml.py b/detection_rules/ml.py new file mode 100644 index 000000000..6b8ea821b --- /dev/null +++ b/detection_rules/ml.py @@ -0,0 +1,452 @@ +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0; you may not use this file except in compliance with the Elastic License +# 2.0. + +"""Schemas and dataclasses for experimental ML features.""" + +import io +import zipfile +from dataclasses import dataclass +from functools import cached_property, lru_cache +from pathlib import Path +from typing import Dict, List, Literal, Optional + +import click +import elasticsearch +import json +import requests +from eql.table import Table +from elasticsearch import Elasticsearch +from elasticsearch.client import IngestClient, LicenseClient, MlClient + +from .eswrap import es_experimental +from .ghwrap import ManifestManager, ReleaseManifest +from .misc import client_error +from .schemas import definitions +from .utils import get_path, unzip_to_dict + + +ML_PATH = Path(get_path('machine-learning')) + + +def info_from_tag(tag: str) -> (Literal['ml'], definitions.MachineLearningType, str, int): + try: + ml, release_type, release_date, release_number = tag.split('-') + except ValueError as exc: + raise ValueError(f'{tag} is not of valid release format: ml-type-date-number. {exc}') + + return ml, release_type, release_date, int(release_number) + + +class InvalidLicenseError(Exception): + """Invalid stack license for ML features requiring platinum or enterprise.""" + + +@dataclass +class MachineLearningClient: + """Class for experimental machine learning release clients.""" + + es_client: Elasticsearch + bundle: dict + + @cached_property + def model_id(self) -> str: + return next(data['model_id'] for name, data in self.bundle.items() if Path(name).stem.lower().endswith('model')) + + @cached_property + def bundle_type(self) -> str: + return self.model_id.split('_')[0].lower() + + @cached_property + def ml_client(self) -> MlClient: + return MlClient(self.es_client) + + @cached_property + def ingest_client(self) -> IngestClient: + return IngestClient(self.es_client) + + @cached_property + def license(self) -> str: + license_client = LicenseClient(self.es_client) + return license_client.get()['license']['type'].lower() + + @staticmethod + @lru_cache + def ml_manifests() -> Dict[str, ReleaseManifest]: + return get_ml_model_manifests_by_model_id() + + def verify_license(self): + valid_license = self.license in ('platinum', 'enterprise') + + if not valid_license: + err_msg = 'Your subscription level does not support Machine Learning. See ' \ + 'https://www.elastic.co/subscriptions for more information.' + raise InvalidLicenseError(err_msg) + + @classmethod + def from_release(cls, es_client: Elasticsearch, release_tag: str, + repo: str = 'elastic/detection-rules') -> 'MachineLearningClient': + """Load from a GitHub release.""" + full_type = '-'.join(info_from_tag(release_tag)[:2]) + release_url = f'https://api.github.com/repos/{repo}/releases/tags/{release_tag}' + release = requests.get(release_url) + release.raise_for_status() + + # check that the release only has a single zip file + assets = [a for a in release.json()['assets'] if + a['name'].startswith(full_type) and a['name'].endswith('.zip')] + assert len(assets) == 1, f'Malformed release: expected 1 {full_type} zip file, found: {len(assets)}!' + + zipped_url = assets[0]['browser_download_url'] + zipped_raw = requests.get(zipped_url) + zipped_bundle = zipfile.ZipFile(io.BytesIO(zipped_raw.content)) + bundle = unzip_to_dict(zipped_bundle) + + return cls(es_client=es_client, bundle=bundle) + + @classmethod + def from_directory(cls, es_client: Elasticsearch, directory: Path) -> 'MachineLearningClient': + """Load from an unzipped local directory.""" + bundle = json.loads(directory.read_text()) + return cls(es_client=es_client, bundle=bundle) + + def remove(self) -> dict: + """Remove machine learning files from a stack.""" + results = dict(script={}, pipeline={}, model={}) + for pipeline in list(self.get_related_pipelines()): + results['pipeline'][pipeline] = self.ingest_client.delete_pipeline(pipeline) + for script in list(self.get_related_scripts()): + results['script'][script] = self.es_client.delete_script(script) + + results['model'][self.model_id] = self.ml_client.delete_trained_model(self.model_id) + return results + + def setup(self) -> dict: + """Setup machine learning bundle on a stack.""" + self.verify_license() + results = dict(script={}, pipeline={}, model={}) + + # upload in order: model, scripts, then pipelines + parsed_bundle = dict(model={}, script={}, pipeline={}) + for filename, data in self.bundle.items(): + fp = Path(filename) + file_type = fp.stem.split('_')[-1] + parsed_bundle[file_type][fp.stem] = data + + model = list(parsed_bundle['model'].values())[0] + results['model'][model['model_id']] = self.upload_model(model['model_id'], model) + + for script_name, script in parsed_bundle['script'].items(): + results['script'][script_name] = self.upload_script(script_name, script) + + for pipeline_name, pipeline in parsed_bundle['pipeline'].items(): + results['pipeline'][pipeline_name] = self.upload_ingest_pipeline(pipeline_name, pipeline) + + return results + + def get_all_scripts(self) -> Dict[str, dict]: + """Get all scripts from an elasticsearch instance.""" + return self.es_client.cluster.state()['metadata']['stored_scripts'] + + def get_related_scripts(self) -> Dict[str, dict]: + """Get all scripts which start with ml_*.""" + scripts = self.get_all_scripts() + return {n: s for n, s in scripts.items() if n.lower().startswith(f'ml_{self.bundle_type}')} + + def get_related_pipelines(self) -> Dict[str, dict]: + """Get all pipelines which start with ml_*.""" + pipelines = self.ingest_client.get_pipeline() + return {n: s for n, s in pipelines.items() if n.lower().startswith(f'ml_{self.bundle_type}')} + + def get_related_model(self) -> Optional[dict]: + """Get a model from an elasticsearch instance matching the model_id.""" + for model in self.get_all_existing_model_files(): + if model['model_id'] == self.model_id: + return model + + def get_all_existing_model_files(self) -> dict: + """Get available models from a stack.""" + return self.ml_client.get_trained_models()['trained_model_configs'] + + @classmethod + def get_existing_model_ids(cls, es_client: Elasticsearch) -> List[str]: + """Get model IDs for existing ML models.""" + ml_client = MlClient(es_client) + return [m['model_id'] for m in ml_client.get_trained_models()['trained_model_configs'] + if m['model_id'] in cls.ml_manifests()] + + @classmethod + def check_model_exists(cls, es_client: Elasticsearch, model_id: str) -> bool: + """Check if a model exists on a stack by model id.""" + ml_client = MlClient(es_client) + return model_id in [m['model_id'] for m in ml_client.get_trained_models()['trained_model_configs']] + + def get_related_files(self) -> dict: + """Check for the presence and status of ML bundle files on a stack.""" + files = { + 'pipeline': self.get_related_pipelines(), + 'script': self.get_related_scripts(), + 'model': self.get_related_model(), + 'release': self.get_related_release() + } + return files + + def get_related_release(self) -> ReleaseManifest: + """Get the GitHub release related to a model.""" + return self.ml_manifests.get(self.model_id) + + @classmethod + def get_all_ml_files(cls, es_client: Elasticsearch) -> dict: + """Get all scripts, pipelines, and models which start with ml_*.""" + pipelines = IngestClient(es_client).get_pipeline() + scripts = es_client.cluster.state()['metadata']['stored_scripts'] + models = MlClient(es_client).get_trained_models()['trained_model_configs'] + manifests = get_ml_model_manifests_by_model_id() + + files = { + 'pipeline': {n: s for n, s in pipelines.items() if n.lower().startswith('ml_')}, + 'script': {n: s for n, s in scripts.items() if n.lower().startswith('ml_')}, + 'model': {m['model_id']: {'model': m, 'release': manifests[m['model_id']]} + for m in models if m['model_id'] in manifests}, + } + return files + + @classmethod + def remove_ml_scripts_pipelines(cls, es_client: Elasticsearch, ml_type: List[str]) -> dict: + """Remove all ML script and pipeline files.""" + results = dict(script={}, pipeline={}) + ingest_client = IngestClient(es_client) + + files = cls.get_all_ml_files(es_client=es_client) + for file_type, data in files.items(): + for name in list(data): + this_type = name.split('_')[1].lower() + if this_type not in ml_type: + continue + if file_type == 'script': + results[file_type][name] = es_client.delete_script(name) + elif file_type == 'pipeline': + results[file_type][name] = ingest_client.delete_pipeline(name) + + return results + + def upload_model(self, model_id: str, body: dict) -> dict: + """Upload an ML model file.""" + return self.ml_client.put_trained_model(model_id=model_id, body=body) + + def upload_script(self, script_id: str, body: dict) -> dict: + """Install a script file.""" + return self.es_client.put_script(id=script_id, body=body) + + def upload_ingest_pipeline(self, pipeline_id: str, body: dict) -> dict: + """Install a pipeline file.""" + return self.ingest_client.put_pipeline(id=pipeline_id, body=body) + + @staticmethod + def _build_script_error(exc: elasticsearch.RequestError, pipeline_file: str): + """Build an error for a failed script upload.""" + error = exc.info['error'] + cause = error['caused_by'] + error_msg = [ + f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}', + ' '.join(f'{k}: {v}' for k, v in error['position'].items()), + '\n'.join(error['script_stack']) + ] + + return click.style('\n'.join(error_msg), fg='red') + + +def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]: + """Load all ML DGA model release manifests by model id.""" + manifests, _ = ManifestManager.load_all(repo=repo) + model_manifests = {} + + for manifest_name, manifest in manifests.items(): + for asset_name, asset in manifest['assets'].items(): + for entry_name, entry_data in asset['entries'].items(): + if entry_name.startswith('dga') and entry_name.endswith('model.json'): + model_id, _ = entry_name.rsplit('_', 1) + model_manifests[model_id] = ReleaseManifest(**manifest) + break + + return model_manifests + + +@es_experimental.group('ml') +def ml_group(): + """Experimental machine learning commands.""" + + +@ml_group.command('check-files') +@click.pass_context +def check_files(ctx): + """Check ML model files on an elasticsearch instance.""" + files = MachineLearningClient.get_all_ml_files(ctx.obj['es']) + + results = [] + for file_type, data in files.items(): + if file_type == 'model': + continue + for name in list(data): + results.append({'file_type': file_type, 'name': name}) + + for model_name, model in files['model'].items(): + results.append({'file_type': 'model', 'name': model_name, 'related_release': model['release'].tag_name}) + + fields = ['file_type', 'name', 'related_release'] + table = Table.from_list(fields, results) + click.echo(table) + return files + + +@ml_group.command('remove-model') +@click.argument('model-id', required=False) +@click.pass_context +def remove_model(ctx: click.Context, model_id): + """Remove ML model files.""" + es_client = MlClient(ctx.obj['es']) + model_ids = MachineLearningClient.get_existing_model_ids(ctx.obj['es']) + + if not model_id: + model_id = click.prompt('Model ID to remove', type=click.Choice(model_ids)) + + try: + result = es_client.delete_trained_model(model_id) + except elasticsearch.ConflictError as e: + click.echo(f'{e}: try running `remove-scripts-pipelines` first') + ctx.exit(1) + + table = Table.from_list(['model_id', 'status'], [{'model_id': model_id, 'status': result}]) + click.echo(table) + return result + + +@ml_group.command('remove-scripts-pipelines') +@click.option('--dga', is_flag=True) +@click.option('--problemchild', is_flag=True) +@click.pass_context +def remove_scripts_pipelines(ctx: click.Context, **ml_types): + """Remove ML scripts and pipeline files.""" + selected_types = [k for k, v in ml_types.items() if v] + assert selected_types, f'Specify ML types to remove: {list(ml_types)}' + status = MachineLearningClient.remove_ml_scripts_pipelines(es_client=ctx.obj['es'], ml_type=selected_types) + + results = [] + for file_type, response in status.items(): + for name, result in response.items(): + results.append({'file_type': file_type, 'name': name, 'status': result}) + + fields = ['file_type', 'name', 'status'] + table = Table.from_list(fields, results) + click.echo(table) + return status + + +@ml_group.command('setup') +@click.option('--model-tag', '-t', + help='Release tag for model files staged in detection-rules (required to download files)') +@click.option('--repo', '-r', default='elastic/detection-rules', + help='GitHub repository hosting the model file releases (owner/repo)') +@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False), + help='Directory containing local model files') +@click.pass_context +def setup_bundle(ctx, model_tag, repo, model_dir): + """Upload ML model and dependencies to enrich data.""" + es_client: Elasticsearch = ctx.obj['es'] + + if model_tag: + dga_client = MachineLearningClient.from_release(es_client=es_client, release_tag=model_tag, repo=repo) + elif model_dir: + dga_client = MachineLearningClient.from_directory(es_client=es_client, directory=model_dir) + else: + return client_error('model-tag or model-dir required to download model files') + + dga_client.verify_license() + status = dga_client.setup() + + results = [] + for file_type, response in status.items(): + for name, result in response.items(): + if file_type == 'model': + status = 'success' if result.get('create_time') else 'potential_failure' + results.append({'file_type': file_type, 'name': name, 'status': status}) + continue + results.append({'file_type': file_type, 'name': name, 'status': result}) + + fields = ['file_type', 'name', 'status'] + table = Table.from_list(fields, results) + click.echo(table) + + click.echo('Associated rules and jobs can be found under ML-experimental-detections releases in the repo') + click.echo('To upload rules, run: kibana upload-rule ') + click.echo('To upload ML jobs, run: es experimental upload-ml-job ') + + +@ml_group.command('upload-job') +@click.argument('job-file', type=click.Path(exists=True, dir_okay=False)) +@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name') +@click.pass_context +def upload_job(ctx: click.Context, job_file, overwrite): + """Upload experimental ML jobs.""" + es_client: Elasticsearch = ctx.obj['es'] + ml_client = MlClient(es_client) + + with open(job_file, 'r') as f: + job = json.load(f) + + def safe_upload(func): + try: + func(name, body) + except (elasticsearch.ConflictError, elasticsearch.RequestError) as err: + if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception': + client_error(str(err), err, ctx=ctx) + + if overwrite: + ctx.invoke(delete_job, job_name=name, job_type=job_type) + func(name, body) + else: + client_error(str(err), err, ctx=ctx) + + try: + job_type = job['type'] + name = job['name'] + body = job['body'] + + if job_type == 'anomaly_detection': + safe_upload(ml_client.put_job) + elif job_type == 'data_frame_analytic': + safe_upload(ml_client.put_data_frame_analytics) + elif job_type == 'datafeed': + safe_upload(ml_client.put_datafeed) + else: + client_error(f'Unknown ML job type: {job_type}') + + click.echo(f'Uploaded {job_type} job: {name}') + except KeyError as e: + client_error(f'{job_file} missing required info: {e}') + + +@ml_group.command('delete-job') +@click.argument('job-name') +@click.argument('job-type') +@click.pass_context +def delete_job(ctx: click.Context, job_name, job_type, verbose=True): + """Remove experimental ML jobs.""" + es_client: Elasticsearch = ctx.obj['es'] + ml_client = MlClient(es_client) + + try: + if job_type == 'anomaly_detection': + ml_client.delete_job(job_name) + elif job_type == 'data_frame_analytic': + ml_client.delete_data_frame_analytics(job_name) + elif job_type == 'datafeed': + ml_client.delete_datafeed(job_name) + else: + client_error(f'Unknown ML job type: {job_type}') + except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e: + client_error(str(e), e, ctx=ctx) + + if verbose: + click.echo(f'Deleted {job_type} job: {job_name}') diff --git a/detection_rules/schemas/definitions.py b/detection_rules/schemas/definitions.py index 88cb9ae55..9d4bcb3c2 100644 --- a/detection_rules/schemas/definitions.py +++ b/detection_rules/schemas/definitions.py @@ -69,3 +69,8 @@ ThresholdValue = NewType("ThresholdValue", int, validate=validate.Range(min=1)) TimelineTemplateId = NewType('TimelineTemplateId', str, validate=validate.OneOf(list(TIMELINE_TEMPLATES))) TimelineTemplateTitle = NewType('TimelineTemplateTitle', str, validate=validate.OneOf(TIMELINE_TEMPLATES.values())) UUIDString = NewType('UUIDString', str, validate=validate.Regexp(UUID_PATTERN)) + + +# experimental machine learning features and releases +MachineLearningType = Literal['DGA', 'ProblemChild'] +MachineLearningTypeLower = Literal['dga', 'problemchild'] diff --git a/detection_rules/utils.py b/detection_rules/utils.py index 3cacbb828..da03b24d1 100644 --- a/detection_rules/utils.py +++ b/detection_rules/utils.py @@ -18,6 +18,7 @@ import zipfile from dataclasses import is_dataclass, astuple from datetime import datetime, date from pathlib import Path +from typing import Dict, Union import eql.utils from eql.utils import load_dump, stream_json_lines @@ -148,6 +149,24 @@ def unzip_and_save(contents, path, member=None, verbose=True): print('Saved files to {}: \n\t- {}'.format(path, '\n\t- '.join(name_list))) +def unzip_to_dict(zipped: zipfile.ZipFile, load_json=True) -> Dict[str, Union[dict, str]]: + """Unzip and load contents to dict with filenames as keys.""" + bundle = {} + for filename in zipped.namelist(): + if filename.endswith('/'): + continue + + fp = Path(filename) + contents = zipped.read(filename) + + if load_json and fp.suffix == '.json': + contents = json.loads(contents) + + bundle[fp.name] = contents + + return bundle + + def event_sort(events, timestamp='@timestamp', date_format='%Y-%m-%dT%H:%M:%S.%f%z', asc=True): """Sort events from elasticsearch by timestamp.""" diff --git a/docs/ML_DGA.md b/docs/ML_DGA.md deleted file mode 100644 index 6271ac74c..000000000 --- a/docs/ML_DGA.md +++ /dev/null @@ -1,191 +0,0 @@ -# Machine Learning on Domain Generation Algorithm (DGA) - -Several blogs were put out on how you can create and leverage supervised DGA ML models to enrich data within the stack. -* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity) -* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data) - -You can also find some supplementary and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection) - -We also released a blog on getting started with DGA using the CLI and Kibana, which also includes a case study of the process applied to the 2020 [SolarWinds supply chain attack](https://www.elastic.co/blog/elastic-security-provides-free-and-open-protections-for-sunburst): -* [Combining supervised and unsupervised machine learning for DGA detection](https://www.elastic.co/blog/supervised-and-unsupervised-machine-learning-for-dga-detection) - - -For questions, please reach out to the ML team in the #machine-learning channel of the -[Elastic public slack channel](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack) - -They can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning) - -*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)* -*Note: the ML features are considered experimental in Kibana as well as this rules CLI* - -## Releases - -Models and dependencies will be [released](https://github.com/elastic/detection-rules/releases) as `ML-DGA-YYYMMDD-N`. -This tag name is what will need to be passed to the CLI command. - -## Uploading a model and dependencies using the CLI - -### Usage - -```console -python -m detection_rules es experimental setup-dga-model -h - -Elasticsearch client: -Options: - -et, --timeout INTEGER Timeout for elasticsearch client - -ep, --es-password TEXT - -eu, --es-user TEXT - --cloud-id TEXT - --elasticsearch-url TEXT - - -* experimental commands are use at your own risk and may change without warning * - -Usage: detection_rules es experimental setup-dga-model [OPTIONS] - - Upload ML DGA model and dependencies and enrich DNS data. - -Options: - -t, --model-tag TEXT Release tag for model files staged in detection- - rules (required to download files) - -r, --repo TEXT GitHub repository hosting the model file releases - (owner/repo) - -d, --model-dir DIRECTORY Directory containing local model files - --overwrite Overwrite all files if already in the stack - -h, --help Show this message and exit. - -``` - -### Detailed steps - -#### 1. Upload and setup the model file and dependencies - -Run `python -m detection_rules es experimental setup-dga-model -t ` - -*If updating a new model, you should first uninstall any existing models using `remove-dga-model`* - -You can also upload files locally using the `-d` option, so long as the naming convention of the files match the -expected pattern for the filenames. - -#### 2. Update packetbeat configuration - -You will need to update your packebeat.yml config file to point to the enrichment pipeline - -Under `Elasticsearch Output` add the following: - -```yaml -output.elasticsearch: - hosts: ["your-hostname:your-port"] - pipeline: dns_enrich_pipeline -``` - -#### 3. Refresh your packetbeat index - -You can optionally choose to refresh your packetbeat index mapping within Kibana: -* navigate to `Stack Management > (Kibana) Index Patterns` -* select the applicable packetbeat index -* click `refresh field list` - -#### 4. Verify enrichment fields - -Any packetbeat documents with the field `dns.question.registered_domain` should now have the enriched data: -`ml_is_dga.*` - - -## Experimental DGA ML Jobs and Rules - -Once packetbeat data is being enriched, there are some rules and ML jobs which can leverage the enriched fields. -The experimental rules and jobs will be staged separate from the model bundle under the [releases](https://github.com/elastic/detection-rules/releases) -as `ML-experimental-detections-YYYMMDD-N`. These releases should be considered independent of each other. Any relation -to previously released experimental detections will be mentioned in the accompanying readme (such as an update to a rule). - -Note that if a rule is of `type = "machine_learning"`, then it may be dependent on a uploading and running a machine -learning job first. If this is the case, it will likely be annotated within the `note` field of the rule. - -#### Uploading rules - -You can then individually upload these rules using the [kibana upload-rule](../CLI.md#uploading-rules-to-kibana) command - -#### Uploading ML Jobs - -Unzip released jobs and then run `python -m detection_rules es experimental upload-ml-job ` - -To delete a job, run `python -m detection_rules es experimental delete-ml-job ` - -Take note of any errors as some jobs may have dependencies on each other which may require stopping and or removing -referenced jobs first. - - -## For Maintainers - -### Validating release bundles and releasing - -Release assets are expected to be in certain formats with specific naming patterns and json structures. - -#### Filename patterns - -DGA model file naming convention should match the following patterns - -```json -{ - "model": "dga_*_model.json", - "dga_ngrams_create": "dga_*_ngrams_create.json", - "dga_ngrams_transform_delete": "dga_*_ngrams_transform_delete.json", - "dns_enrich_pipeline": "dga_*_ingest_pipeline1.json", - "dns_dga_inference_enrich_pipeline": "dga_*_ingest_pipeline2.json" -} -``` - -Experimental detections do not have to match a specific naming pattern but should be in the following file formats: -* rules: toml -* jobs: json - -#### Uniqueness - -The model file name and hash should be unique or else it will raise a warning in validation. This is important to allow -distinction and ascertain information about a bundle by consulting the manifest, based on a unique name - -Release zipped assets, name, and tag name all share the same name. These should follow the following format: -* Model releases: `ML-DGA-YYYYMMDD-1` -* Detection releases: `ML-experimental-detections-YYYYMMDD-1` - -The trailing digit should be incremented for each release - -Rule and Job names should also be unique - -#### Rule and job structure - -Rules files are only check if they are valid toml, nothing more. Consult existing production rules and schemas for API -expectations - -Job files are checked if they are valid toml and contain the following top level fields: -* name - job name -* type - job type -* body - the actual ML job data. The contents are not validated - -#### Validation - -All of these checks are automated and can be called with: -`python -m detection_rules dev gh-release validate-ml-dga-asset` - for model bundles -`python -m detection_rules dev gh-release validate-ml-detections-asset` for rule/job bundles - -Pay attention to the output to determine any necessary changes. This may not be all inclusive and actual testing on a -live stack should always occur even with passing validation before saving to a GitHub release - -#### Including a readme for detections release - -`ML-experimental-detections-*` releases will need to include a readme to provide an overview of the included files - -#### Releasing - -Install dependencies with `pip install -r requirements-dev.txt` - -A release can be created via the cli using `python -m detection_rules dev gh-release create-ml` - -* you can only use a github token -* the base directory name and release name must match -* you must have write permissions to the repo to create a release -* validation also occurs on this, with a prompt to proceed -* upon completion, a manifest is uploaded as an asset to the GitHub release - -To test, you can fork the repo and use `--repo ` to validate a release is working as expected diff --git a/docs/experimental-machine-learning/DGA.md b/docs/experimental-machine-learning/DGA.md new file mode 100644 index 000000000..c1eb8603a --- /dev/null +++ b/docs/experimental-machine-learning/DGA.md @@ -0,0 +1,54 @@ +# Machine Learning on Domain Generation Algorithm (DGA) + +To create and use supervised DGA ML models to enrich data within the stack, check out these Elastic blogs: +* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity) +* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data) + +You can also find some supplementary material and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection) + +We also released a blog about getting started with DGA using the CLI and Kibana, which also includes a case study of the process applied to the 2020 [SolarWinds supply chain attack](https://www.elastic.co/blog/elastic-security-provides-free-and-open-protections-for-sunburst): +* [Combining supervised and unsupervised machine learning for DGA detection](https://www.elastic.co/blog/supervised-and-unsupervised-machine-learning-for-dga-detection) + + +For questions, please reach out to the ML team in the #machine-learning channel of the +[Elastic community Slack workspace](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack) + +The team can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning) + +*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)* +*Note: the ML features are considered experimental in Kibana as well as this rules CLI* + + +## Detailed steps + +#### 1. Upload and setup the model file and dependencies + +Run `python -m detection_rules es experimental ml setup -t ` + +*If updating a new model, you should first uninstall any existing models using `remove-model`* + +You can also upload files locally using the `-d` option, so long as the naming convention of the files match the +expected pattern for the filenames. + +#### 2. Update packetbeat configuration + +You will need to update your packetbeat.yml config file to point to the enrichment pipeline + +Under `Elasticsearch Output` add the following: + +```yaml +output.elasticsearch: + hosts: ["your-hostname:your-port"] + pipeline: dns_enrich_pipeline +``` + +#### 3. Refresh your packetbeat index + +You can optionally choose to refresh your packetbeat index mapping from within Kibana: +* Navigate to `Stack Management > (Kibana) Index Patterns` +* Select the appropriate packetbeat index +* Click `refresh field list` + +#### 4. Verify enrichment fields + +Any packetbeat documents with the field `dns.question.registered_domain` should now be enriched with `ml_is_dga.*` diff --git a/docs/experimental-machine-learning/experimental-detections.md b/docs/experimental-machine-learning/experimental-detections.md new file mode 100644 index 000000000..a765321a9 --- /dev/null +++ b/docs/experimental-machine-learning/experimental-detections.md @@ -0,0 +1,28 @@ +# Experimental ML Jobs and Rules + +The ingest pipeline enriches process events by adding additional fields, which are used to power several rules. +The experimental rules and jobs are staged separately from the model bundle under [releases](https://github.com/elastic/detection-rules/releases), with the tag `ML-experimental-detections-YYYMMDD-N`. New releases with this tag may contain either updates to existing rules or new experimental detcetions. + +Note that if a rule is of `type = "machine_learning"`, then it may be dependent on uploading and running a machine +learning job first. If this is the case, it will likely be annotated within the `note` field of the rule. + +### Uploading rules + +Unzip the release bundle and upload these rules individually. + +Rules are now stored in ndjson format and can be imported into Kibana via the security app detections page. + +Earlier releases stored the rules in toml format. These can be uploaded using the +[7.12 branch](https://github.com/elastic/detection-rules/tree/7.12) CLI using the +[kibana upload-rule](../../CLI.md#uploading-rules-to-kibana) command + +### Uploading ML Jobs and Datafeeds + +Unzip the release bundle and then run `python -m detection_rules es experimental ml upload-job ` + +To delete a job/datafeed, run `python -m detection_rules es experimental ml delete-job ` + +The CLI automatically identifies whether the provided input file is an ML job or datafeed. + +Take note of any errors as the jobs and datafeeds may have dependencies on each other which may require stopping and/or removing +referenced jobs/datafeeds first. diff --git a/docs/experimental-machine-learning/problem-child.md b/docs/experimental-machine-learning/problem-child.md new file mode 100644 index 000000000..26e943d05 --- /dev/null +++ b/docs/experimental-machine-learning/problem-child.md @@ -0,0 +1,64 @@ +# ProblemChild in the Elastic Stack + +ProblemChild helps detect anomalous activity in Windows process events by: +1) Classifying events as malicious vs benign +2) Identifying anomalous events based on rare parent-child process relationships + +An end-to-end blog on how to build the ProblemChild framework from scratch for your environment can be found [here](https://www.elastic.co/blog/problemchild-detecting-living-off-the-land-attacks). + +You can also find some supplementary material for the blog and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/ProblemChild) + +We also released a blog about getting started with ProblemChild using the CLI and Kibana: +* [ProblemChild Release Blog](link to blog) + + +*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)* +*Note: the ML features are considered experimental in Kibana as well as this rules CLI* + + +## Detailed steps + +#### 1. Upload and setup the model file and dependencies + +Run `python -m detection_rules es experimental ml setup -t ` + +*If updating a new model, you should first uninstall any existing models using `remove-model`* + +You can also upload files locally using the `-d` option, so long as the naming convention of the files match the +expected pattern for the filenames. + +#### 2. Update index pipeline configuration + +You will need to update your index (containing Windows process event data) settings to point to the ProblemChild enrichment pipeline. + +You can do this by running the following command in your Dev Tools console: +``` +PUT your-index-pattern/_settings +{ + "index": { + "default_pipeline": "ML_ProblemChild_ingest_pipeline" + } +} +``` + +If you wish to stop enriching your documents using ProblemChild, run the following command in your dev Tools console: +``` +PUT your-index-pattern/_settings +{ + "index": { + "default_pipeline": null + } +} + +``` + +#### 3. Refresh your indexes + +You can optionally choose to refresh your index mapping from within Kibana: +* Navigate to `Stack Management > (Kibana) Index Patterns` +* Select the appropriate indexes +* Click `refresh field list` + +#### 4. Verify enrichment fields + +Any documents corresponding to Windows process events should now be enriched with `problemchild.*` \ No newline at end of file diff --git a/docs/experimental-machine-learning/readme.md b/docs/experimental-machine-learning/readme.md new file mode 100644 index 000000000..e13c347d7 --- /dev/null +++ b/docs/experimental-machine-learning/readme.md @@ -0,0 +1,102 @@ +# Experimental machine learning + +This repo contains some additional information and files to use experimental[*](#what-does-experimental-mean-in-this-context) machine learning features and detections + +## Features +* [DGA](DGA.md) +* [ProblemChild](problem_child.md) +* [experimental detections](experimental-detections.md) + +## Releases + +There are separate [releases](https://github.com/elastic/detection-rules/releases) for: +* DGA: `ML-DGA-*` +* problem child: `ML-ProblemChild-*` +* experimental detections: `ML-experimental-detections-*` + +Releases will use the tag `ML-TYPE-YYYMMDD-N`, which will be needed for uploading the model using the CLI. + + +## CLI + +Support commands can be found under `python -m detection_rules es experimental ml -h` + +```console +Elasticsearch client: +Options: + -et, --timeout INTEGER Timeout for elasticsearch client + -ep, --es-password TEXT + -eu, --es-user TEXT + --elasticsearch-url TEXT + --cloud-id TEXT + + +* experimental commands are use at your own risk and may change without warning * + +Usage: detection_rules es experimental ml [OPTIONS] COMMAND [ARGS]... + + Experimental machine learning commands. + +Options: + -h, --help Show this message and exit. + +Commands: + check-files Check ML model files on an elasticsearch... + delete-job Remove experimental ML jobs. + remove-model Remove ML model files. + remove-scripts-pipelines Remove ML scripts and pipeline files. + setup Upload ML model and dependencies to enrich data. + upload-job Upload experimental ML jobs. +``` + +## Managing a model and dependencies using the CLI + +### Installing + +```console +python -m detection_rules es experimental ml setup -h + +Elasticsearch client: +Options: + -et, --timeout INTEGER Timeout for elasticsearch client + -ep, --es-password TEXT + -eu, --es-user TEXT + --cloud-id TEXT + --elasticsearch-url TEXT + + +* experimental commands are use at your own risk and may change without warning * + +Usage: detection_rules es experimental ml setup [OPTIONS] + + Upload ML model and dependencies to enrich data. + +Options: + -t, --model-tag TEXT Release tag for model files staged in detection- + rules (required to download files) + -r, --repo TEXT GitHub repository hosting the model file releases + (owner/repo) + -d, --model-dir DIRECTORY Directory containing local model files + --overwrite Overwrite all files if already in the stack + -h, --help Show this message and exit. + +``` + +### Removing + +To remove the ML bundle, you will need to remove the pipelines and scripts first and then the model. + +You can do this by running: +* `python -m detection_rules es experimental ml remove-pipeline-scripts --dga --problemchild` +* `python -m detection_rules es experimental ml remove-model ` + + +---- + +##### What does experimental mean in this context? + +Experimental model bundles (models, scripts, and pipelines), rules, and jobs are components which are currently in +development and so may not have completed the testing or scrutiny which full production detections are subjected to. + +It may also make use of features which are not yet GA and so may be subject to change and are not covered by the support +SLA of general release (GA) features. Some of these features may also never make it to GA. \ No newline at end of file