Refactor experimental ML CLI and code (#1218)

* move github and ml to their own files
* refactor release and ml commands
* update ML readmes
* add unzip_to_dict function
* prompt for model ID in remove-model
* update experimental rule upload process
* update remove-scripts-pipelines to take multiple options

Co-authored-by: Ross Wolf <31489089+rw-access@users.noreply.github.com>
Co-authored-by: Apoorva <appujo@gmail.com>
This commit is contained in:
Justin Ibarra
2021-06-02 20:37:12 -08:00
committed by GitHub
parent e46f5e96d3
commit 0ec8d67e78
15 changed files with 1039 additions and 1044 deletions
+2
View File
@@ -107,9 +107,11 @@ ENV/
/_extras/
# detection rules
.detection-rules-cfg.json
releases/
collections/
enriched-rule-indexes/
exports/
ML-models/
surveys/
machine-learning/
+4
View File
@@ -12,9 +12,11 @@ from . import ( # noqa: E402
devtools,
docs,
eswrap,
ghwrap,
kbwrap,
main,
mappings,
ml,
misc,
rule_formatter,
rule_loader,
@@ -26,10 +28,12 @@ __all__ = (
'devtools',
'docs',
'eswrap',
'ghwrap',
'kbwrap',
'mappings',
"main",
'misc',
'ml',
'rule_formatter',
'rule_loader',
'schemas',
+1 -2
View File
@@ -67,14 +67,13 @@ def multi_collection(f):
@click.option('--rule-id', '-id', multiple=True, required=False)
@functools.wraps(f)
def get_collection(*args, **kwargs):
rule_name: List[str] = kwargs.pop("rule_name", [])
rule_id: List[str] = kwargs.pop("rule_id", [])
rule_files: List[str] = kwargs.pop("rule_file")
directories: List[str] = kwargs.pop("directory")
rules = RuleCollection()
if not (rule_name or rule_id or rule_files):
if not (directories or rule_id or rule_files):
client_error('Required: at least one of --rule-id, --rule-file, or --directory')
rules.load_files(Path(p) for p in rule_files)
+11 -236
View File
@@ -6,7 +6,6 @@
"""CLI commands for internal detection_rules dev team."""
import dataclasses
import functools
import hashlib
import io
import json
import os
@@ -25,10 +24,11 @@ from kibana.connector import Kibana
from . import rule_loader
from .cli_utils import single_collection
from .eswrap import CollectEvents, add_range_to_dsl
from .ghwrap import GithubClient
from .main import root
from .misc import GithubClient, Manifest, PYTHON_LICENSE, add_client, client_error, getdefault
from .misc import PYTHON_LICENSE, add_client, client_error
from .packaging import PACKAGE_FILE, Package, RELEASE_DIR, current_stack_version, manage_versions
from .rule import QueryRuleData, TOMLRule
from .rule import AnyRuleData, BaseRuleData, QueryRuleData, TOMLRule
from .rule_loader import RuleCollection, production_filter
from .utils import dict_hash, get_path, load_dump
@@ -417,6 +417,14 @@ def deprecate_rule(ctx: click.Context, rule_file: str):
click.echo(f'Rule moved to {deprecated_path} - remember to git add this file')
@dev_group.command("update-schemas")
def update_schemas():
classes = [BaseRuleData] + list(typing.get_args(AnyRuleData))
for cls in classes:
cls.save_schema()
@dev_group.group('test')
def test_group():
"""Commands for testing against stack resources."""
@@ -557,236 +565,3 @@ def rule_survey(ctx: click.Context, query, date_range, dump_file, hide_zero_coun
json.dump(details, f, indent=2, sort_keys=True)
return survey_results
@dev_group.group('gh-release')
def gh_release_group():
"""Commands to manage GitHub releases."""
@gh_release_group.command('create-ml')
@click.argument('directory', type=click.Path(dir_okay=True, file_okay=False))
@click.option('--gh-token', '-t', default=getdefault('gh_token'))
@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo')
@click.option('--release-name', '-n', required=True, help='Name of release')
@click.option('--description', '-d', help='Description of release to append to default message')
@click.pass_context
def create_ml_release(ctx, directory, gh_token, repo, release_name, description):
"""Create a GitHub release."""
import re
# ML-DGA-20201129-25
pattern = r'^(ML-DGA|ML-experimental-detections)-\d{4}\d{2}\d{2}-\d+$'
assert re.match(pattern, release_name), f'release name must match pattern: {pattern}'
assert Path(directory).name == release_name, f'directory name must match release name: {release_name}'
gh_token = gh_token or click.prompt('GitHub token', hide_input=True)
client = GithubClient(gh_token)
gh_repo = client.authenticated_client.get_repo(repo)
# validate tag name is increment by 1
name_prefix, _, version = release_name.rsplit('-', 2)
version = int(version)
releases = gh_repo.get_releases()
max_ver = max([int(r.raw_data['name'].split('-')[-1]) for r in releases
if r.raw_data['name'].startswith(name_prefix)], default=0)
if version != (max_ver + 1):
client_error(f'Last release version was {max_ver}. Release name should end with version: {max_ver + 1}')
# validate files
if name_prefix == 'ML-DGA':
zipped_bundle, description_str = ctx.invoke(validate_ml_dga_asset, directory=directory, repo=repo)
else:
zipped_bundle, description_str = ctx.invoke(validate_ml_detections_asset, directory=directory)
click.confirm('Validation passed, verify output. Continue?')
if description:
description_str = f'{description_str}\n\n----\n\n{description}'
release = gh_repo.create_git_release(name=release_name, tag=release_name, message=description_str)
zip_name = Path(zipped_bundle).name
click.echo(f'release created at: {release.html_url}')
# add zipped bundle as an asset to the release
click.echo(f'Uploading zip file: {zip_name}')
release.upload_asset(zipped_bundle, label=zip_name, name=zip_name, content_type='application/zip')
# create manifest entry
click.echo('creating manifest for release')
manifest = Manifest(repo, tag_name=release_name, token=gh_token)
manifest.save()
return release
@gh_release_group.command('validate-ml-dga-asset')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo')
def validate_ml_dga_asset(directory, repo):
""""Validate and prep an ML DGA bundle for release."""
from .eswrap import expected_ml_dga_patterns
now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
files = list(Path(directory).glob('*'))
if len(files) > 5:
client_error('Too many files, expected 5')
click.secho('[*] validated expected number of files', fg='green')
# backup files - will re-save sorted to have deterministic hash
backup_path = Path(directory).resolve().parent.joinpath(f'backups-{Path(directory).name}-{now.replace(":", "-")}')
shutil.copytree(directory, backup_path)
# validate file names and json and load
loaded_contents = {}
for name, pattern in expected_ml_dga_patterns.items():
path = list(Path(directory).glob(pattern))
match_count = len(path)
if match_count != 1:
client_error(f'Expected filename pattern "{pattern}" for "{name}": {match_count} matches detected')
file_path = path[0]
try:
with open(file_path, 'r') as f:
contents = json.dumps(json.load(f), sort_keys=True)
loaded_contents[name] = {'contents': contents, 'filename': file_path}
sha256 = hashlib.sha256(contents.encode('utf-8')).hexdigest()
click.secho(f' - sha256: {sha256} - {name}')
# re-save sorted
with open(file_path, 'w') as f:
f.write(contents)
except json.JSONDecodeError as e:
client_error(f'Invalid JSON in {file_path} file', e)
model_filename = Path(loaded_contents['model']['filename']).name
model_name, _ = model_filename.rsplit('_', maxsplit=1)
click.secho('[*] re-saved all files with keys sorted for deterministic hashing', fg='green')
click.secho(f' [+] backups saved to: {backup_path}')
click.secho('[*] validated expected naming patterns for all files', fg='green')
click.secho('[*] validated json formatting of all files', fg='green')
# check manifest for existing things
existing_sha = False
existing_model_name = False
model_hash = hashlib.sha256(loaded_contents['model']['contents'].encode('utf-8')).hexdigest()
manifest_hashes = Manifest.get_existing_asset_hashes(repo)
for release, file_data in manifest_hashes.items():
for file_name, sha in file_data.items():
if model_hash == sha:
existing_sha = True
click.secho(f'[!] hash for model file: "{loaded_contents["model"]["filename"]}" matches: '
f'{release} -> {file_name} -> {sha}', fg='yellow')
if model_filename == file_name:
existing_model_name = True
client_error(f'name for model file: "{loaded_contents["model"]["filename"]}" matches: '
f'{release} -> {file_name} -> {file_name}')
if not existing_sha:
click.secho(f'[+] validated no existing models matched hashes for: '
f'{loaded_contents["model"]["filename"]}', fg='green')
if not existing_model_name:
click.secho(f'[+] validated no existing models matched names for: '
f'{loaded_contents["model"]["filename"]}', fg='green')
# save zip
zip_name_no_ext = Path(directory).resolve()
zip_name = f'{zip_name_no_ext}.zip'
shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name)
click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green')
click.secho(f'[!] run `setup-dga-model -d {directory}` to test this on a live stack before releasing', fg='yellow')
description = {
'model_name': model_name + '\n\n----\n\n',
'date': now,
'model_sha256': model_hash,
'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md'
}
description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()])
click.echo()
click.echo(f'[*] description to paste with release:\n\n{description_str}\n')
return zip_name, description_str
@gh_release_group.command('validate-ml-detections-asset')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
def validate_ml_detections_asset(directory):
"""Validate and prep ML detection rules and jobs before release."""
import pytoml
now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
all_files = list(Path(directory).glob('*'))
job_paths = [f for f in all_files if f.suffix == '.json']
rule_paths = [f for f in all_files if f.suffix == '.toml']
other_paths = [f for f in Path(directory).glob('*') if f.suffix not in ('.toml', '.json')]
job_count = len(job_paths)
rule_count = len(rule_paths)
other_count = len(other_paths)
if 'readme.md' not in [f.name.lower() for f in other_paths]:
client_error('Release is missing readme file')
for job in job_paths:
try:
with open(job, 'r') as f:
j = json.load(f)
assert j.get('name'), click.style(f'[!] job file "{job}" missing: name', fg='red')
assert j.get('type'), click.style(f'[!] job file "{job}" missing: type', fg='red')
assert j.get('body'), click.style(f'[!] job file "{job}" missing: body', fg='red')
except json.JSONDecodeError as e:
client_error(f'Invalid JSON in {job} file', e)
click.secho(f'[*] validated json formatting and required fields in {job_count} job files', fg='green')
for rule in rule_paths:
with open(rule, 'r') as f:
try:
pytoml.load(f)
except pytoml.TomlError as e:
client_error(f'[!] invalid rule toml for: {rule}', e)
click.secho(f'[*] validated toml formatting for {rule_count} rule files', fg='green')
# save zip
zip_name_no_ext = Path(directory).resolve()
zip_name = f'{zip_name_no_ext}.zip'
shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name)
click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green')
click.secho('[!] run `kibana upload-rule` to test rules on a live stack before releasing', fg='green')
click.secho('[!] run `es upload-ml-job` to test jobs on a live stack before releasing', fg='green')
description = {
'Experimental rules': rule_count,
'Experimental ML jobs': job_count,
'Other files': str(other_count) + '\n\n----\n\n',
'DGA release': '<add link to DGA release these detections were built on>',
'date': now,
'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md'
}
description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()])
click.echo()
click.echo(f'description to paste with release:\n\n{description_str}\n')
return zip_name, description_str
@dev_group.command("update-schemas")
def update_schemas():
from .rule import BaseRuleData, AnyRuleData
classes = [BaseRuleData] + list(typing.get_args(AnyRuleData))
for cls in classes:
cls.save_schema()
+1 -317
View File
@@ -8,14 +8,12 @@ import json
import os
import time
from collections import defaultdict
from contextlib import contextmanager
from pathlib import Path
from typing import Union
import click
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch.client import AsyncSearchClient, IngestClient, LicenseClient, MlClient
from elasticsearch.client import AsyncSearchClient
import kql
from .main import root
@@ -422,317 +420,3 @@ def index_repo(ctx: click.Context, query, from_file, save_files):
def es_experimental():
"""[Experimental] helper commands for integrating with Elasticsearch."""
click.secho('\n* experimental commands are use at your own risk and may change without warning *\n')
@es_experimental.command('check-model-files')
@click.pass_context
def check_model_files(ctx):
"""Check ML model files on an elasticsearch instance."""
from elasticsearch.client import IngestClient, MlClient
from .misc import get_ml_model_manifests_by_model_id
es_client: Elasticsearch = ctx.obj['es']
ml_client = MlClient(es_client)
ingest_client = IngestClient(es_client)
def safe_get(func, arg):
try:
return func(arg)
except elasticsearch.NotFoundError:
return None
models = [m for m in ml_client.get_trained_models().get('trained_model_configs', [])
if m['created_by'] != '_xpack']
if models:
if len([m for m in models if m['model_id'].startswith('dga_')]) > 1:
click.secho('Multiple DGA models detected! It is not recommended to run more than one DGA model at a time',
fg='yellow')
manifests = get_ml_model_manifests_by_model_id()
click.echo(f'DGA Model{"s" if len(models) > 1 else ""} found:')
for model in models:
manifest = manifests.get(model['model_id'])
click.echo(f' - {model["model_id"]}, associated release: {manifest.html_url if manifest else None}')
else:
click.echo('No DGA Models found')
support_files = {
'create_script': safe_get(es_client.get_script, 'dga_ngrams_create'),
'delete_script': safe_get(es_client.get_script, 'dga_ngrams_transform_delete'),
'enrich_pipeline': safe_get(ingest_client.get_pipeline, 'dns_enrich_pipeline'),
'inference_pipeline': safe_get(ingest_client.get_pipeline, 'dns_dga_inference_enrich_pipeline')
}
click.echo('Support Files:')
for support_file, results in support_files.items():
click.echo(f' - {support_file}: {"found" if results else "not found"}')
@es_experimental.command('remove-dga-model')
@click.argument('model-id')
@click.option('--force', '-f', is_flag=True, help='Force the attempted delete without checking if model exists')
@click.pass_context
def remove_dga_model(ctx, model_id, force, es_client: Elasticsearch = None, ml_client: MlClient = None,
ingest_client: IngestClient = None):
"""Remove ML DGA files."""
from elasticsearch.client import IngestClient, MlClient
es_client = es_client or ctx.obj['es']
ml_client = ml_client or MlClient(es_client)
ingest_client = ingest_client or IngestClient(es_client)
def safe_delete(func, fid, verbose=True):
try:
func(fid)
except elasticsearch.NotFoundError:
return False
if verbose:
click.echo(f' - {fid} deleted')
return True
model_exists = False
if not force:
existing_models = ml_client.get_trained_models()
model_exists = model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]
if model_exists or force:
if model_exists:
click.secho('[-] Existing model detected - deleting files', fg='yellow')
deleted = [
safe_delete(ingest_client.delete_pipeline, 'dns_dga_inference_enrich_pipeline'),
safe_delete(ingest_client.delete_pipeline, 'dns_enrich_pipeline'),
safe_delete(es_client.delete_script, 'dga_ngrams_transform_delete'),
# f'{model_id}_dga_ngrams_transform_delete'
safe_delete(es_client.delete_script, 'dga_ngrams_create'),
# f'{model_id}_dga_ngrams_create'
safe_delete(ml_client.delete_trained_model, model_id)
]
if not any(deleted):
click.echo('No files deleted')
else:
click.echo(f'Model: {model_id} not found')
expected_ml_dga_patterns = {
'model': 'dga_*_model.json', # noqa: E241
'dga_ngrams_create': 'dga_*_ngrams_create.json', # noqa: E241
'dga_ngrams_transform_delete': 'dga_*_ngrams_transform_delete.json', # noqa: E241
'dns_enrich_pipeline': 'dga_*_ingest_pipeline1.json', # noqa: E241
'dns_dga_inference_enrich_pipeline': 'dga_*_ingest_pipeline2.json' # noqa: E241
}
@es_experimental.command('setup-dga-model')
@click.option('--model-tag', '-t',
help='Release tag for model files staged in detection-rules (required to download files)')
@click.option('--repo', '-r', default='elastic/detection-rules',
help='GitHub repository hosting the model file releases (owner/repo)')
@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False),
help='Directory containing local model files')
@click.option('--overwrite', is_flag=True, help='Overwrite all files if already in the stack')
@click.pass_context
def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite):
"""Upload ML DGA model and dependencies and enrich DNS data."""
import io
import requests
import shutil
import zipfile
es_client: Elasticsearch = ctx.obj['es']
client_info = es_client.info()
license_client = LicenseClient(es_client)
if license_client.get()['license']['type'].lower() not in ('platinum', 'enterprise'):
client_error('You must have a platinum or enterprise subscription in order to use these ML features')
# download files if necessary
if not model_dir:
if not model_tag:
client_error('model-tag or model-dir required to download model files')
click.echo(f'Downloading artifact: {model_tag}')
release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}'
release = requests.get(release_url)
release.raise_for_status()
assets = [a for a in release.json()['assets'] if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')]
if len(assets) != 1:
client_error(f'Malformed release: expected 1 match ML-DGA zip, found: {len(assets)}!')
zipped_url = assets[0]['browser_download_url']
zipped = requests.get(zipped_url)
z = zipfile.ZipFile(io.BytesIO(zipped.content))
dga_dir = get_path('ML-models', 'DGA')
model_dir = os.path.join(dga_dir, model_tag)
os.makedirs(dga_dir, exist_ok=True)
shutil.rmtree(model_dir, ignore_errors=True)
z.extractall(dga_dir)
click.echo(f'files saved to {model_dir}')
# read files as needed
z.close()
def get_model_filename(pattern):
paths = list(Path(model_dir).glob(pattern))
if not paths:
client_error(f'{model_dir} missing files matching the pattern: {pattern}')
if len(paths) > 1:
client_error(f'{model_dir} contains multiple files matching the pattern: {pattern}')
return paths[0]
@contextmanager
def open_model_file(name):
pattern = expected_ml_dga_patterns[name]
with open(get_model_filename(pattern), 'r') as f:
yield json.load(f)
model_id, _ = os.path.basename(get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1)
click.echo(f'Setting up DGA model: "{model_id}" on {client_info["name"]} ({client_info["version"]["number"]})')
# upload model
ml_client = MlClient(es_client)
ingest_client = IngestClient(es_client)
existing_models = ml_client.get_trained_models()
if model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]:
if overwrite:
ctx.invoke(remove_dga_model, model_id=model_id, es_client=es_client, ml_client=ml_client,
ingest_client=ingest_client, force=True)
else:
client_error(f'Model: {model_id} already exists on stack! Try --overwrite to force the upload')
click.secho('[+] Uploading model (may take a while)')
with open_model_file('model') as model_file:
try:
ml_client.put_trained_model(model_id=model_id, body=model_file)
except elasticsearch.ConnectionTimeout:
msg = 'Connection timeout, try increasing timeout using `es --timeout <secs> experimental setup_dga_model`.'
client_error(msg)
# install scripts
click.secho('[+] Uploading painless scripts')
with open_model_file('dga_ngrams_create') as painless_install:
es_client.put_script(id='dga_ngrams_create', body=painless_install)
# f'{model_id}_dga_ngrams_create'
with open_model_file('dga_ngrams_transform_delete') as painless_delete:
es_client.put_script(id='dga_ngrams_transform_delete', body=painless_delete)
# f'{model_id}_dga_ngrams_transform_delete'
# Install ingest pipelines
click.secho('[+] Uploading pipelines')
def _build_es_script_error(err, pipeline_file):
error = err.info['error']
cause = error['caused_by']
error_msg = [
f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
' '.join(f'{k}: {v}' for k, v in error['position'].items()),
'\n'.join(error['script_stack'])
]
return click.style('\n'.join(error_msg), fg='red')
with open_model_file('dns_enrich_pipeline') as ingest_pipeline1:
try:
ingest_client.put_pipeline(id='dns_enrich_pipeline', body=ingest_pipeline1)
except elasticsearch.RequestError as e:
if e.error == 'script_exception':
client_error(_build_es_script_error(e, 'ingest_pipeline1'), e, ctx=ctx)
else:
raise
with open_model_file('dns_dga_inference_enrich_pipeline') as ingest_pipeline2:
try:
ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline', body=ingest_pipeline2)
except elasticsearch.RequestError as e:
if e.error == 'script_exception':
client_error(_build_es_script_error(e, 'ingest_pipeline2'), e, ctx=ctx)
else:
raise
click.echo('Ensure that you have updated your packetbeat.yml config file.')
click.echo(' - reference: ML_DGA.md #2-update-packetbeat-configuration')
click.echo('Associated rules and jobs can be found under ML-experimental-detections releases in the repo')
click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
click.echo('To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
@es_experimental.command('upload-ml-job')
@click.argument('job-file', type=click.Path(exists=True, dir_okay=False))
@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name')
@click.pass_context
def upload_ml_job(ctx: click.Context, job_file, overwrite):
"""Upload experimental ML jobs."""
es_client: Elasticsearch = ctx.obj['es']
ml_client = MlClient(es_client)
with open(job_file, 'r') as f:
job = json.load(f)
def safe_upload(func):
try:
func(name, body)
except (elasticsearch.ConflictError, elasticsearch.RequestError) as err:
if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception':
client_error(str(err), err, ctx=ctx)
if overwrite:
ctx.invoke(delete_ml_job, job_name=name, job_type=job_type)
func(name, body)
else:
client_error(str(err), err, ctx=ctx)
try:
job_type = job['type']
name = job['name']
body = job['body']
if job_type == 'anomaly_detection':
safe_upload(ml_client.put_job)
elif job_type == 'data_frame_analytic':
safe_upload(ml_client.put_data_frame_analytics)
elif job_type == 'datafeed':
safe_upload(ml_client.put_datafeed)
else:
client_error(f'Unknown ML job type: {job_type}')
click.echo(f'Uploaded {job_type} job: {name}')
except KeyError as e:
client_error(f'{job_file} missing required info: {e}')
@es_experimental.command('delete-ml-job')
@click.argument('job-name')
@click.argument('job-type')
@click.pass_context
def delete_ml_job(ctx: click.Context, job_name, job_type, verbose=True):
"""Remove experimental ML jobs."""
es_client: Elasticsearch = ctx.obj['es']
ml_client = MlClient(es_client)
try:
if job_type == 'anomaly_detection':
ml_client.delete_job(job_name)
elif job_type == 'data_frame_analytic':
ml_client.delete_data_frame_analytics(job_name)
elif job_type == 'datafeed':
ml_client.delete_datafeed(job_name)
else:
client_error(f'Unknown ML job type: {job_type}')
except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e:
client_error(str(e), e, ctx=ctx)
if verbose:
click.echo(f'Deleted {job_type} job: {job_name}')
+295
View File
@@ -0,0 +1,295 @@
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
"""Schemas and dataclasses for GitHub releases."""
import dataclasses
import hashlib
import io
import json
import shutil
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Tuple
from zipfile import ZipFile
import click
import requests
from .schemas import definitions
# this is primarily for type hinting - all use of the github client should come from GithubClient class
try:
from github import Github
from github.Repository import Repository
from github.GitRelease import GitRelease
from github.GitReleaseAsset import GitReleaseAsset
except ImportError:
# for type hinting
Github = None # noqa: N806
Repository = None # noqa: N806
GitRelease = None # noqa: N806
GitReleaseAsset = None # noqa: N806
def get_gh_release(repo: Repository, release_name: Optional[str] = None, tag_name: Optional[str] = None) -> GitRelease:
"""Get a list of GitHub releases by repo."""
assert release_name or tag_name, 'Must specify a release_name or tag_name'
releases = repo.get_releases()
for release in releases:
if release_name and release_name == release.title:
return release
elif tag_name and tag_name == release.tag_name:
return release
def load_zipped_gh_assets_with_metadata(url: str) -> Tuple[str, dict]:
"""Download and unzip a GitHub assets."""
response = requests.get(url)
zipped_asset = ZipFile(io.BytesIO(response.content))
zipped_sha256 = hashlib.sha256(response.content).hexdigest()
assets = {}
for zipped in zipped_asset.filelist:
if zipped.is_dir():
continue
contents = zipped_asset.read(zipped.filename)
sha256 = hashlib.sha256(contents).hexdigest()
assets[zipped.filename] = {
'contents': contents,
'metadata': {
'compress_size': zipped.compress_size,
# zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday
'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)),
'sha256': sha256,
'size': zipped.file_size,
}
}
return zipped_sha256, assets
def load_json_gh_asset(url: str) -> dict:
"""Load and return the contents of a json asset file."""
response = requests.get(url)
response.raise_for_status()
return response.json()
def download_gh_asset(url: str, path: str, overwrite=False):
"""Download and unzip a GitHub asset."""
zipped = requests.get(url)
z = ZipFile(io.BytesIO(zipped.content))
Path(path).mkdir(exist_ok=True)
if overwrite:
shutil.rmtree(path, ignore_errors=True)
z.extractall(path)
click.echo(f'files saved to {path}')
z.close()
class GithubClient:
"""GitHub client wrapper."""
def __init__(self, token: Optional[str] = None):
"""Get an unauthenticated client, verified authenticated client, or a default client."""
if not Github:
raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`')
self.client: Github = Github(token)
self.unauthenticated_client = Github()
self.__token = token
self.__authenticated_client = None
@property
def authenticated_client(self) -> Github:
if not self.__token:
raise ValueError('Token not defined! Re-instantiate with a token or use add_token method')
if not self.__authenticated_client:
self.__authenticated_client = Github(self.__token)
return self.__authenticated_client
def add_token(self, token):
self.__token = token
@dataclass
class AssetManifestEntry:
compress_size: int
created_at: datetime
name: str
sha256: str
size: int
@dataclass
class AssetManifestMetadata:
relative_url: str
entries: Dict[str, AssetManifestEntry]
zipped_sha256: definitions.Sha256
created_at: datetime = field(default_factory=datetime.utcnow)
description: Optional[str] = None # populated by GitHub release asset label
@dataclass
class ReleaseManifest:
assets: Dict[str, AssetManifestMetadata]
assets_url: str
author: str # parsed from GitHub release metadata as: author[login]
created_at: str
html_url: str
id: int
name: str
published_at: str
url: str
zipball_url: str
tag_name: str = None
description: str = None # parsed from GitHub release metadata as: body
class ManifestManager:
"""Manifest handler for GitHub releases."""
def __init__(self, repo: str = 'elastic/detection-rules', release_name: Optional[str] = None,
tag_name: Optional[str] = None, token: Optional[str] = None):
self.repo_name = repo
self.release_name = release_name
self.tag_name = tag_name
self.gh_client = GithubClient(token)
self.has_token = token is not None
self.repo: Repository = self.gh_client.client.get_repo(repo)
self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name)
if not self.release:
raise ValueError(f'No release found for {tag_name or release_name}')
if not self.release_name:
self.release_name = self.release.title
self.manifest_name = f'manifest-{self.release_name}.json'
self.assets: dict = self._get_enriched_assets_from_release()
self.release_manifest = self._create()
self.__release_manifest_dict = dataclasses.asdict(self.release_manifest)
self.manifest_size = len(json.dumps(self.__release_manifest_dict))
@property
def release_manifest_fl(self) -> io.BytesIO:
return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8'))
def _create(self) -> ReleaseManifest:
"""Create the manifest from GitHub asset metadata and file contents."""
assets = {}
for asset_name, asset_data in self.assets.items():
entries = {}
data = asset_data['data']
metadata = asset_data['metadata']
for file_name, file_data in data.items():
file_metadata = file_data['metadata']
name = Path(file_name).name
file_metadata.update(name=name)
entry = AssetManifestEntry(**file_metadata)
entries[name] = entry
assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries,
metadata['zipped_sha256'], metadata['created_at'],
metadata['label'])
release_metadata = self._parse_release_metadata()
release_metadata.update(assets=assets)
release_manifest = ReleaseManifest(**release_metadata)
return release_manifest
def _parse_release_metadata(self) -> dict:
"""Parse relevant info from GitHub metadata for release manifest."""
ignore = ['assets']
manual_set_keys = ['author', 'description']
keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys]
parsed = {k: self.release.raw_data[k] for k in keys}
parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login'])
return parsed
def save(self) -> GitReleaseAsset:
"""Save manifest files."""
if not self.has_token:
raise ValueError('You must provide a token to save a manifest to a GitHub release')
asset = self.release.upload_asset_from_memory(self.release_manifest_fl,
self.manifest_size,
self.manifest_name)
click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}')
return asset
@classmethod
def load(cls, name: str, repo: str = 'elastic/detection-rules', token: Optional[str] = None) -> Optional[dict]:
"""Load a manifest."""
gh_client = GithubClient(token)
repo = gh_client.client.get_repo(repo)
release = get_gh_release(repo, tag_name=name)
for asset in release.get_assets():
if asset.name == f'manifest-{name}.json':
return load_json_gh_asset(asset.browser_download_url)
@classmethod
def load_all(cls, repo: str = 'elastic/detection-rules', token: Optional[str] = None
) -> Tuple[Dict[str, dict], list]:
"""Load a consolidated manifest."""
gh_client = GithubClient(token)
repo = gh_client.client.get_repo(repo)
consolidated = {}
missing = set()
for release in repo.get_releases():
name = release.tag_name
asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None)
if not asset:
missing.add(name)
else:
consolidated[name] = load_json_gh_asset(asset.browser_download_url)
return consolidated, list(missing)
@classmethod
def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token: Optional[str] = None) -> dict:
"""Load all assets with their hashes, by release."""
flat = {}
consolidated, _ = cls.load_all(repo=repo, token=token)
for release, data in consolidated.items():
for asset in data['assets'].values():
flat_release = flat[release] = {}
for asset_name, asset_data in asset['entries'].items():
flat_release[asset_name] = asset_data['sha256']
return flat
def _get_enriched_assets_from_release(self) -> dict:
"""Get assets and metadata from a GitHub release."""
assets = {}
for asset in [a.raw_data for a in self.release.get_assets()]:
zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url'])
asset.update(zipped_sha256=zipped_sha256)
assets[asset['name']] = {
'metadata': asset,
'data': data
}
return assets
+1 -298
View File
@@ -4,22 +4,14 @@
# 2.0.
"""Misc support."""
import hashlib
import io
import json
import os
import re
import shutil
import time
import uuid
import dataclasses
from dataclasses import dataclass, field
from datetime import datetime
from functools import wraps
from pathlib import Path
from typing import Dict, NoReturn, Tuple
from zipfile import ZipFile
from typing import NoReturn
import click
import requests
@@ -57,295 +49,6 @@ JS_LICENSE = """
""".strip().format("\n".join(' * ' + line for line in LICENSE_LINES))
def get_gh_release(repo: Repository, release_name=None, tag_name=None) -> GitRelease:
"""Get a list of GitHub releases by repo."""
assert release_name or tag_name, 'Must specify a release_name or tag_name'
releases = repo.get_releases()
release = next((r for r in releases
if (release_name and release_name == r.title) or (tag_name and tag_name == r.tag_name)), None)
return release
def upload_gh_release_asset(token, asset: bytes, asset_name, repo=None, content_type='application/zip',
release_name=None, tag_name=None, upload_url=None):
"""Save a Github relase asset."""
if not upload_url:
assert repo, 'You must provide a repo name if not providing an upload_url'
release = get_gh_release(repo, release_name, tag_name)
upload_url = release['upload_url']
suffix = '{?name,label}'
upload_url = upload_url.replace(suffix, f'?name={asset_name}&label={asset_name}')
headers = {'content-type': content_type}
r = requests.post(upload_url, auth=('', token), data=asset, headers=headers)
r.raise_for_status()
click.echo(f'Uploaded {asset_name} to release: {r.json()["url"]}')
def load_zipped_gh_assets_with_metadata(url) -> Tuple[str, dict]:
"""Download and unzip a GitHub assets."""
response = requests.get(url)
zipped_asset = ZipFile(io.BytesIO(response.content))
zipped_sha256 = hashlib.sha256(response.content).hexdigest()
assets = {}
for zipped in zipped_asset.filelist:
if zipped.is_dir():
continue
contents = zipped_asset.read(zipped.filename)
sha256 = hashlib.sha256(contents).hexdigest()
assets[zipped.filename] = {
'contents': contents,
'metadata': {
'compress_size': zipped.compress_size,
# zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday
'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)),
'sha256': sha256,
'size': zipped.file_size,
}
}
return zipped_sha256, assets
def load_json_gh_asset(url) -> dict:
"""Load and return the contents of a json asset file."""
response = requests.get(url)
response.raise_for_status()
return response.json()
def download_gh_asset(url, path, overwrite=False):
"""Download and unzip a GitHub asset."""
zipped = requests.get(url)
z = ZipFile(io.BytesIO(zipped.content))
Path(path).mkdir(exist_ok=True)
if overwrite:
shutil.rmtree(path, ignore_errors=True)
z.extractall(path)
click.echo(f'files saved to {path}')
z.close()
class GithubClient:
"""GitHub client wrapper."""
def __init__(self, token=None):
"""Get an unauthenticated client, verified authenticated client, or a default client."""
if not Github:
raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`')
self.client: Github = Github(token)
self.unauthenticated_client = Github()
self.__token = token
self.__authenticated_client = None
@property
def authenticated_client(self) -> Github:
if not self.__token:
raise ValueError('Token not defined! Re-instantiate with a token or use add_token method')
if not self.__authenticated_client:
self.__authenticated_client = Github(self.__token)
return self.__authenticated_client
def add_token(self, token):
self.__token = token
@dataclass
class AssetManifestEntry:
compress_size: int
created_at: datetime
name: str
sha256: str
size: int
@dataclass
class AssetManifestMetadata:
relative_url: str
entries: Dict[str, AssetManifestEntry]
zipped_sha256: str
created_at: datetime = field(default_factory=datetime.utcnow)
description: str = None # label
@dataclass
class ReleaseManifest:
assets: Dict[str, AssetManifestMetadata]
assets_url: str
author: str # parsed from GitHub release metadata as: author[login]
created_at: str
html_url: str
id: int
name: str
published_at: str
url: str
zipball_url: str
tag_name: str = None
description: str = None # body
class Manifest:
"""Manifest handler for GitHub releases."""
def __init__(self, repo: str = 'elastic/detection-rules', release_name=None, tag_name=None, token=None):
self.repo_name = repo
self.release_name = release_name
self.tag_name = tag_name
self.gh_client = GithubClient(token)
self.has_token = token is not None
self.repo: Repository = self.gh_client.client.get_repo(repo)
self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name)
if not self.release:
raise ValueError(f'No release found for {tag_name or release_name}')
if not self.release_name:
self.release_name = self.release.title
self.manifest_name = f'manifest-{self.release_name}.json'
self.assets: dict = self._get_enriched_assets_from_release()
self.release_manifest = self._create()
self.__release_manifest_dict = dataclasses.asdict(self.release_manifest)
self.manifest_size = len(json.dumps(self.__release_manifest_dict))
@property
def release_manifest_fl(self):
return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8'))
def _create(self):
"""Create the manifest from GitHub asset metadata and file contents."""
assets = {}
for asset_name, asset_data in self.assets.items():
entries = {}
data = asset_data['data']
metadata = asset_data['metadata']
for file_name, file_data in data.items():
file_metadata = file_data['metadata']
name = Path(file_name).name
file_metadata.update(name=name)
entry = AssetManifestEntry(**file_metadata)
entries[name] = entry
assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries,
metadata['zipped_sha256'], metadata['created_at'],
metadata['label'])
release_metadata = self._parse_release_metadata()
release_metadata.update(assets=assets)
release_manifest = ReleaseManifest(**release_metadata)
return release_manifest
def _parse_release_metadata(self):
"""Parse relevant info from GitHub metadata for release manifest."""
ignore = ['assets']
manual_set_keys = ['author', 'description']
keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys]
parsed = {k: self.release.raw_data[k] for k in keys}
parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login'])
return parsed
def save(self) -> GitReleaseAsset:
"""Save manifest files."""
if not self.has_token:
raise ValueError('You must provide a token to save a manifest to a GitHub release')
asset = self.release.upload_asset_from_memory(self.release_manifest_fl,
self.manifest_size,
self.manifest_name)
click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}')
return asset
@classmethod
def load(cls, name: str, repo: str = 'elastic/detection-rules', token=None) -> dict:
"""Load a manifest."""
gh_client = GithubClient(token)
repo = gh_client.client.get_repo(repo)
release = get_gh_release(repo, tag_name=name)
asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None)
if asset is not None:
return load_json_gh_asset(asset.browser_download_url)
@classmethod
def load_all(cls, repo: str = 'elastic/detection-rules', token=None) -> Tuple[Dict[str, dict], list]:
"""Load a consolidated manifest."""
gh_client = GithubClient(token)
repo = gh_client.client.get_repo(repo)
consolidated = {}
missing = set()
for release in repo.get_releases():
name = release.tag_name
asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None)
if not asset:
missing.add(name)
else:
consolidated[name] = load_json_gh_asset(asset.browser_download_url)
return consolidated, list(missing)
@classmethod
def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token=None) -> dict:
"""Load all assets with their hashes, by release."""
flat = {}
consolidated, _ = cls.load_all(repo=repo, token=token)
for release, data in consolidated.items():
for asset in data['assets'].values():
flat_release = flat[release] = {}
for asset_name, asset_data in asset['entries'].items():
flat_release[asset_name] = asset_data['sha256']
return flat
def _get_enriched_assets_from_release(self):
"""Get assets and metadata from a GitHub release."""
assets = {}
for asset in [a.raw_data for a in self.release.get_assets()]:
zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url'])
asset.update(zipped_sha256=zipped_sha256)
assets[asset['name']] = {
'metadata': asset,
'data': data
}
return assets
def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]:
"""Load all ML DGA model release manifests by model id."""
manifests, _ = Manifest.load_all(repo=repo)
model_manifests = {}
for manifest_name, manifest in manifests.items():
for asset_name, asset in manifest['assets'].items():
for entry_name, entry_data in asset['entries'].items():
if entry_name.startswith('dga') and entry_name.endswith('model.json'):
model_id, _ = entry_name.rsplit('_', 1)
model_manifests[model_id] = ReleaseManifest(**manifest)
break
return model_manifests
class ClientError(click.ClickException):
"""Custom CLI error to format output or full debug stacktrace."""
+452
View File
@@ -0,0 +1,452 @@
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
"""Schemas and dataclasses for experimental ML features."""
import io
import zipfile
from dataclasses import dataclass
from functools import cached_property, lru_cache
from pathlib import Path
from typing import Dict, List, Literal, Optional
import click
import elasticsearch
import json
import requests
from eql.table import Table
from elasticsearch import Elasticsearch
from elasticsearch.client import IngestClient, LicenseClient, MlClient
from .eswrap import es_experimental
from .ghwrap import ManifestManager, ReleaseManifest
from .misc import client_error
from .schemas import definitions
from .utils import get_path, unzip_to_dict
ML_PATH = Path(get_path('machine-learning'))
def info_from_tag(tag: str) -> (Literal['ml'], definitions.MachineLearningType, str, int):
try:
ml, release_type, release_date, release_number = tag.split('-')
except ValueError as exc:
raise ValueError(f'{tag} is not of valid release format: ml-type-date-number. {exc}')
return ml, release_type, release_date, int(release_number)
class InvalidLicenseError(Exception):
"""Invalid stack license for ML features requiring platinum or enterprise."""
@dataclass
class MachineLearningClient:
"""Class for experimental machine learning release clients."""
es_client: Elasticsearch
bundle: dict
@cached_property
def model_id(self) -> str:
return next(data['model_id'] for name, data in self.bundle.items() if Path(name).stem.lower().endswith('model'))
@cached_property
def bundle_type(self) -> str:
return self.model_id.split('_')[0].lower()
@cached_property
def ml_client(self) -> MlClient:
return MlClient(self.es_client)
@cached_property
def ingest_client(self) -> IngestClient:
return IngestClient(self.es_client)
@cached_property
def license(self) -> str:
license_client = LicenseClient(self.es_client)
return license_client.get()['license']['type'].lower()
@staticmethod
@lru_cache
def ml_manifests() -> Dict[str, ReleaseManifest]:
return get_ml_model_manifests_by_model_id()
def verify_license(self):
valid_license = self.license in ('platinum', 'enterprise')
if not valid_license:
err_msg = 'Your subscription level does not support Machine Learning. See ' \
'https://www.elastic.co/subscriptions for more information.'
raise InvalidLicenseError(err_msg)
@classmethod
def from_release(cls, es_client: Elasticsearch, release_tag: str,
repo: str = 'elastic/detection-rules') -> 'MachineLearningClient':
"""Load from a GitHub release."""
full_type = '-'.join(info_from_tag(release_tag)[:2])
release_url = f'https://api.github.com/repos/{repo}/releases/tags/{release_tag}'
release = requests.get(release_url)
release.raise_for_status()
# check that the release only has a single zip file
assets = [a for a in release.json()['assets'] if
a['name'].startswith(full_type) and a['name'].endswith('.zip')]
assert len(assets) == 1, f'Malformed release: expected 1 {full_type} zip file, found: {len(assets)}!'
zipped_url = assets[0]['browser_download_url']
zipped_raw = requests.get(zipped_url)
zipped_bundle = zipfile.ZipFile(io.BytesIO(zipped_raw.content))
bundle = unzip_to_dict(zipped_bundle)
return cls(es_client=es_client, bundle=bundle)
@classmethod
def from_directory(cls, es_client: Elasticsearch, directory: Path) -> 'MachineLearningClient':
"""Load from an unzipped local directory."""
bundle = json.loads(directory.read_text())
return cls(es_client=es_client, bundle=bundle)
def remove(self) -> dict:
"""Remove machine learning files from a stack."""
results = dict(script={}, pipeline={}, model={})
for pipeline in list(self.get_related_pipelines()):
results['pipeline'][pipeline] = self.ingest_client.delete_pipeline(pipeline)
for script in list(self.get_related_scripts()):
results['script'][script] = self.es_client.delete_script(script)
results['model'][self.model_id] = self.ml_client.delete_trained_model(self.model_id)
return results
def setup(self) -> dict:
"""Setup machine learning bundle on a stack."""
self.verify_license()
results = dict(script={}, pipeline={}, model={})
# upload in order: model, scripts, then pipelines
parsed_bundle = dict(model={}, script={}, pipeline={})
for filename, data in self.bundle.items():
fp = Path(filename)
file_type = fp.stem.split('_')[-1]
parsed_bundle[file_type][fp.stem] = data
model = list(parsed_bundle['model'].values())[0]
results['model'][model['model_id']] = self.upload_model(model['model_id'], model)
for script_name, script in parsed_bundle['script'].items():
results['script'][script_name] = self.upload_script(script_name, script)
for pipeline_name, pipeline in parsed_bundle['pipeline'].items():
results['pipeline'][pipeline_name] = self.upload_ingest_pipeline(pipeline_name, pipeline)
return results
def get_all_scripts(self) -> Dict[str, dict]:
"""Get all scripts from an elasticsearch instance."""
return self.es_client.cluster.state()['metadata']['stored_scripts']
def get_related_scripts(self) -> Dict[str, dict]:
"""Get all scripts which start with ml_*."""
scripts = self.get_all_scripts()
return {n: s for n, s in scripts.items() if n.lower().startswith(f'ml_{self.bundle_type}')}
def get_related_pipelines(self) -> Dict[str, dict]:
"""Get all pipelines which start with ml_*."""
pipelines = self.ingest_client.get_pipeline()
return {n: s for n, s in pipelines.items() if n.lower().startswith(f'ml_{self.bundle_type}')}
def get_related_model(self) -> Optional[dict]:
"""Get a model from an elasticsearch instance matching the model_id."""
for model in self.get_all_existing_model_files():
if model['model_id'] == self.model_id:
return model
def get_all_existing_model_files(self) -> dict:
"""Get available models from a stack."""
return self.ml_client.get_trained_models()['trained_model_configs']
@classmethod
def get_existing_model_ids(cls, es_client: Elasticsearch) -> List[str]:
"""Get model IDs for existing ML models."""
ml_client = MlClient(es_client)
return [m['model_id'] for m in ml_client.get_trained_models()['trained_model_configs']
if m['model_id'] in cls.ml_manifests()]
@classmethod
def check_model_exists(cls, es_client: Elasticsearch, model_id: str) -> bool:
"""Check if a model exists on a stack by model id."""
ml_client = MlClient(es_client)
return model_id in [m['model_id'] for m in ml_client.get_trained_models()['trained_model_configs']]
def get_related_files(self) -> dict:
"""Check for the presence and status of ML bundle files on a stack."""
files = {
'pipeline': self.get_related_pipelines(),
'script': self.get_related_scripts(),
'model': self.get_related_model(),
'release': self.get_related_release()
}
return files
def get_related_release(self) -> ReleaseManifest:
"""Get the GitHub release related to a model."""
return self.ml_manifests.get(self.model_id)
@classmethod
def get_all_ml_files(cls, es_client: Elasticsearch) -> dict:
"""Get all scripts, pipelines, and models which start with ml_*."""
pipelines = IngestClient(es_client).get_pipeline()
scripts = es_client.cluster.state()['metadata']['stored_scripts']
models = MlClient(es_client).get_trained_models()['trained_model_configs']
manifests = get_ml_model_manifests_by_model_id()
files = {
'pipeline': {n: s for n, s in pipelines.items() if n.lower().startswith('ml_')},
'script': {n: s for n, s in scripts.items() if n.lower().startswith('ml_')},
'model': {m['model_id']: {'model': m, 'release': manifests[m['model_id']]}
for m in models if m['model_id'] in manifests},
}
return files
@classmethod
def remove_ml_scripts_pipelines(cls, es_client: Elasticsearch, ml_type: List[str]) -> dict:
"""Remove all ML script and pipeline files."""
results = dict(script={}, pipeline={})
ingest_client = IngestClient(es_client)
files = cls.get_all_ml_files(es_client=es_client)
for file_type, data in files.items():
for name in list(data):
this_type = name.split('_')[1].lower()
if this_type not in ml_type:
continue
if file_type == 'script':
results[file_type][name] = es_client.delete_script(name)
elif file_type == 'pipeline':
results[file_type][name] = ingest_client.delete_pipeline(name)
return results
def upload_model(self, model_id: str, body: dict) -> dict:
"""Upload an ML model file."""
return self.ml_client.put_trained_model(model_id=model_id, body=body)
def upload_script(self, script_id: str, body: dict) -> dict:
"""Install a script file."""
return self.es_client.put_script(id=script_id, body=body)
def upload_ingest_pipeline(self, pipeline_id: str, body: dict) -> dict:
"""Install a pipeline file."""
return self.ingest_client.put_pipeline(id=pipeline_id, body=body)
@staticmethod
def _build_script_error(exc: elasticsearch.RequestError, pipeline_file: str):
"""Build an error for a failed script upload."""
error = exc.info['error']
cause = error['caused_by']
error_msg = [
f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
' '.join(f'{k}: {v}' for k, v in error['position'].items()),
'\n'.join(error['script_stack'])
]
return click.style('\n'.join(error_msg), fg='red')
def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]:
"""Load all ML DGA model release manifests by model id."""
manifests, _ = ManifestManager.load_all(repo=repo)
model_manifests = {}
for manifest_name, manifest in manifests.items():
for asset_name, asset in manifest['assets'].items():
for entry_name, entry_data in asset['entries'].items():
if entry_name.startswith('dga') and entry_name.endswith('model.json'):
model_id, _ = entry_name.rsplit('_', 1)
model_manifests[model_id] = ReleaseManifest(**manifest)
break
return model_manifests
@es_experimental.group('ml')
def ml_group():
"""Experimental machine learning commands."""
@ml_group.command('check-files')
@click.pass_context
def check_files(ctx):
"""Check ML model files on an elasticsearch instance."""
files = MachineLearningClient.get_all_ml_files(ctx.obj['es'])
results = []
for file_type, data in files.items():
if file_type == 'model':
continue
for name in list(data):
results.append({'file_type': file_type, 'name': name})
for model_name, model in files['model'].items():
results.append({'file_type': 'model', 'name': model_name, 'related_release': model['release'].tag_name})
fields = ['file_type', 'name', 'related_release']
table = Table.from_list(fields, results)
click.echo(table)
return files
@ml_group.command('remove-model')
@click.argument('model-id', required=False)
@click.pass_context
def remove_model(ctx: click.Context, model_id):
"""Remove ML model files."""
es_client = MlClient(ctx.obj['es'])
model_ids = MachineLearningClient.get_existing_model_ids(ctx.obj['es'])
if not model_id:
model_id = click.prompt('Model ID to remove', type=click.Choice(model_ids))
try:
result = es_client.delete_trained_model(model_id)
except elasticsearch.ConflictError as e:
click.echo(f'{e}: try running `remove-scripts-pipelines` first')
ctx.exit(1)
table = Table.from_list(['model_id', 'status'], [{'model_id': model_id, 'status': result}])
click.echo(table)
return result
@ml_group.command('remove-scripts-pipelines')
@click.option('--dga', is_flag=True)
@click.option('--problemchild', is_flag=True)
@click.pass_context
def remove_scripts_pipelines(ctx: click.Context, **ml_types):
"""Remove ML scripts and pipeline files."""
selected_types = [k for k, v in ml_types.items() if v]
assert selected_types, f'Specify ML types to remove: {list(ml_types)}'
status = MachineLearningClient.remove_ml_scripts_pipelines(es_client=ctx.obj['es'], ml_type=selected_types)
results = []
for file_type, response in status.items():
for name, result in response.items():
results.append({'file_type': file_type, 'name': name, 'status': result})
fields = ['file_type', 'name', 'status']
table = Table.from_list(fields, results)
click.echo(table)
return status
@ml_group.command('setup')
@click.option('--model-tag', '-t',
help='Release tag for model files staged in detection-rules (required to download files)')
@click.option('--repo', '-r', default='elastic/detection-rules',
help='GitHub repository hosting the model file releases (owner/repo)')
@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False),
help='Directory containing local model files')
@click.pass_context
def setup_bundle(ctx, model_tag, repo, model_dir):
"""Upload ML model and dependencies to enrich data."""
es_client: Elasticsearch = ctx.obj['es']
if model_tag:
dga_client = MachineLearningClient.from_release(es_client=es_client, release_tag=model_tag, repo=repo)
elif model_dir:
dga_client = MachineLearningClient.from_directory(es_client=es_client, directory=model_dir)
else:
return client_error('model-tag or model-dir required to download model files')
dga_client.verify_license()
status = dga_client.setup()
results = []
for file_type, response in status.items():
for name, result in response.items():
if file_type == 'model':
status = 'success' if result.get('create_time') else 'potential_failure'
results.append({'file_type': file_type, 'name': name, 'status': status})
continue
results.append({'file_type': file_type, 'name': name, 'status': result})
fields = ['file_type', 'name', 'status']
table = Table.from_list(fields, results)
click.echo(table)
click.echo('Associated rules and jobs can be found under ML-experimental-detections releases in the repo')
click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
click.echo('To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
@ml_group.command('upload-job')
@click.argument('job-file', type=click.Path(exists=True, dir_okay=False))
@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name')
@click.pass_context
def upload_job(ctx: click.Context, job_file, overwrite):
"""Upload experimental ML jobs."""
es_client: Elasticsearch = ctx.obj['es']
ml_client = MlClient(es_client)
with open(job_file, 'r') as f:
job = json.load(f)
def safe_upload(func):
try:
func(name, body)
except (elasticsearch.ConflictError, elasticsearch.RequestError) as err:
if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception':
client_error(str(err), err, ctx=ctx)
if overwrite:
ctx.invoke(delete_job, job_name=name, job_type=job_type)
func(name, body)
else:
client_error(str(err), err, ctx=ctx)
try:
job_type = job['type']
name = job['name']
body = job['body']
if job_type == 'anomaly_detection':
safe_upload(ml_client.put_job)
elif job_type == 'data_frame_analytic':
safe_upload(ml_client.put_data_frame_analytics)
elif job_type == 'datafeed':
safe_upload(ml_client.put_datafeed)
else:
client_error(f'Unknown ML job type: {job_type}')
click.echo(f'Uploaded {job_type} job: {name}')
except KeyError as e:
client_error(f'{job_file} missing required info: {e}')
@ml_group.command('delete-job')
@click.argument('job-name')
@click.argument('job-type')
@click.pass_context
def delete_job(ctx: click.Context, job_name, job_type, verbose=True):
"""Remove experimental ML jobs."""
es_client: Elasticsearch = ctx.obj['es']
ml_client = MlClient(es_client)
try:
if job_type == 'anomaly_detection':
ml_client.delete_job(job_name)
elif job_type == 'data_frame_analytic':
ml_client.delete_data_frame_analytics(job_name)
elif job_type == 'datafeed':
ml_client.delete_datafeed(job_name)
else:
client_error(f'Unknown ML job type: {job_type}')
except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e:
client_error(str(e), e, ctx=ctx)
if verbose:
click.echo(f'Deleted {job_type} job: {job_name}')
+5
View File
@@ -69,3 +69,8 @@ ThresholdValue = NewType("ThresholdValue", int, validate=validate.Range(min=1))
TimelineTemplateId = NewType('TimelineTemplateId', str, validate=validate.OneOf(list(TIMELINE_TEMPLATES)))
TimelineTemplateTitle = NewType('TimelineTemplateTitle', str, validate=validate.OneOf(TIMELINE_TEMPLATES.values()))
UUIDString = NewType('UUIDString', str, validate=validate.Regexp(UUID_PATTERN))
# experimental machine learning features and releases
MachineLearningType = Literal['DGA', 'ProblemChild']
MachineLearningTypeLower = Literal['dga', 'problemchild']
+19
View File
@@ -18,6 +18,7 @@ import zipfile
from dataclasses import is_dataclass, astuple
from datetime import datetime, date
from pathlib import Path
from typing import Dict, Union
import eql.utils
from eql.utils import load_dump, stream_json_lines
@@ -148,6 +149,24 @@ def unzip_and_save(contents, path, member=None, verbose=True):
print('Saved files to {}: \n\t- {}'.format(path, '\n\t- '.join(name_list)))
def unzip_to_dict(zipped: zipfile.ZipFile, load_json=True) -> Dict[str, Union[dict, str]]:
"""Unzip and load contents to dict with filenames as keys."""
bundle = {}
for filename in zipped.namelist():
if filename.endswith('/'):
continue
fp = Path(filename)
contents = zipped.read(filename)
if load_json and fp.suffix == '.json':
contents = json.loads(contents)
bundle[fp.name] = contents
return bundle
def event_sort(events, timestamp='@timestamp', date_format='%Y-%m-%dT%H:%M:%S.%f%z', asc=True):
"""Sort events from elasticsearch by timestamp."""
-191
View File
@@ -1,191 +0,0 @@
# Machine Learning on Domain Generation Algorithm (DGA)
Several blogs were put out on how you can create and leverage supervised DGA ML models to enrich data within the stack.
* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity)
* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data)
You can also find some supplementary and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection)
We also released a blog on getting started with DGA using the CLI and Kibana, which also includes a case study of the process applied to the 2020 [SolarWinds supply chain attack](https://www.elastic.co/blog/elastic-security-provides-free-and-open-protections-for-sunburst):
* [Combining supervised and unsupervised machine learning for DGA detection](https://www.elastic.co/blog/supervised-and-unsupervised-machine-learning-for-dga-detection)
For questions, please reach out to the ML team in the #machine-learning channel of the
[Elastic public slack channel](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack)
They can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning)
*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)*
*Note: the ML features are considered experimental in Kibana as well as this rules CLI*
## Releases
Models and dependencies will be [released](https://github.com/elastic/detection-rules/releases) as `ML-DGA-YYYMMDD-N`.
This tag name is what will need to be passed to the CLI command.
## Uploading a model and dependencies using the CLI
### Usage
```console
python -m detection_rules es experimental setup-dga-model -h
Elasticsearch client:
Options:
-et, --timeout INTEGER Timeout for elasticsearch client
-ep, --es-password TEXT
-eu, --es-user TEXT
--cloud-id TEXT
--elasticsearch-url TEXT
* experimental commands are use at your own risk and may change without warning *
Usage: detection_rules es experimental setup-dga-model [OPTIONS]
Upload ML DGA model and dependencies and enrich DNS data.
Options:
-t, --model-tag TEXT Release tag for model files staged in detection-
rules (required to download files)
-r, --repo TEXT GitHub repository hosting the model file releases
(owner/repo)
-d, --model-dir DIRECTORY Directory containing local model files
--overwrite Overwrite all files if already in the stack
-h, --help Show this message and exit.
```
### Detailed steps
#### 1. Upload and setup the model file and dependencies
Run `python -m detection_rules es <args_or_config> experimental setup-dga-model -t <release-tag>`
*If updating a new model, you should first uninstall any existing models using `remove-dga-model`*
You can also upload files locally using the `-d` option, so long as the naming convention of the files match the
expected pattern for the filenames.
#### 2. Update packetbeat configuration
You will need to update your packebeat.yml config file to point to the enrichment pipeline
Under `Elasticsearch Output` add the following:
```yaml
output.elasticsearch:
hosts: ["your-hostname:your-port"]
pipeline: dns_enrich_pipeline
```
#### 3. Refresh your packetbeat index
You can optionally choose to refresh your packetbeat index mapping within Kibana:
* navigate to `Stack Management > (Kibana) Index Patterns`
* select the applicable packetbeat index
* click `refresh field list`
#### 4. Verify enrichment fields
Any packetbeat documents with the field `dns.question.registered_domain` should now have the enriched data:
`ml_is_dga.*`
## Experimental DGA ML Jobs and Rules
Once packetbeat data is being enriched, there are some rules and ML jobs which can leverage the enriched fields.
The experimental rules and jobs will be staged separate from the model bundle under the [releases](https://github.com/elastic/detection-rules/releases)
as `ML-experimental-detections-YYYMMDD-N`. These releases should be considered independent of each other. Any relation
to previously released experimental detections will be mentioned in the accompanying readme (such as an update to a rule).
Note that if a rule is of `type = "machine_learning"`, then it may be dependent on a uploading and running a machine
learning job first. If this is the case, it will likely be annotated within the `note` field of the rule.
#### Uploading rules
You can then individually upload these rules using the [kibana upload-rule](../CLI.md#uploading-rules-to-kibana) command
#### Uploading ML Jobs
Unzip released jobs and then run `python -m detection_rules es <args> experimental upload-ml-job <ml_job.json>`
To delete a job, run `python -m detection_rules es <args> experimental delete-ml-job <job-name> <job-type>`
Take note of any errors as some jobs may have dependencies on each other which may require stopping and or removing
referenced jobs first.
## For Maintainers
### Validating release bundles and releasing
Release assets are expected to be in certain formats with specific naming patterns and json structures.
#### Filename patterns
DGA model file naming convention should match the following patterns
```json
{
"model": "dga_*_model.json",
"dga_ngrams_create": "dga_*_ngrams_create.json",
"dga_ngrams_transform_delete": "dga_*_ngrams_transform_delete.json",
"dns_enrich_pipeline": "dga_*_ingest_pipeline1.json",
"dns_dga_inference_enrich_pipeline": "dga_*_ingest_pipeline2.json"
}
```
Experimental detections do not have to match a specific naming pattern but should be in the following file formats:
* rules: toml
* jobs: json
#### Uniqueness
The model file name and hash should be unique or else it will raise a warning in validation. This is important to allow
distinction and ascertain information about a bundle by consulting the manifest, based on a unique name
Release zipped assets, name, and tag name all share the same name. These should follow the following format:
* Model releases: `ML-DGA-YYYYMMDD-1`
* Detection releases: `ML-experimental-detections-YYYYMMDD-1`
The trailing digit should be incremented for each release
Rule and Job names should also be unique
#### Rule and job structure
Rules files are only check if they are valid toml, nothing more. Consult existing production rules and schemas for API
expectations
Job files are checked if they are valid toml and contain the following top level fields:
* name - job name
* type - job type
* body - the actual ML job data. The contents are not validated
#### Validation
All of these checks are automated and can be called with:
`python -m detection_rules dev gh-release validate-ml-dga-asset` - for model bundles
`python -m detection_rules dev gh-release validate-ml-detections-asset` for rule/job bundles
Pay attention to the output to determine any necessary changes. This may not be all inclusive and actual testing on a
live stack should always occur even with passing validation before saving to a GitHub release
#### Including a readme for detections release
`ML-experimental-detections-*` releases will need to include a readme to provide an overview of the included files
#### Releasing
Install dependencies with `pip install -r requirements-dev.txt`
A release can be created via the cli using `python -m detection_rules dev gh-release create-ml`
* you can only use a github token
* the base directory name and release name must match
* you must have write permissions to the repo to create a release
* validation also occurs on this, with a prompt to proceed
* upon completion, a manifest is uploaded as an asset to the GitHub release
To test, you can fork the repo and use `--repo <your-fork>` to validate a release is working as expected
+54
View File
@@ -0,0 +1,54 @@
# Machine Learning on Domain Generation Algorithm (DGA)
To create and use supervised DGA ML models to enrich data within the stack, check out these Elastic blogs:
* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity)
* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data)
You can also find some supplementary material and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection)
We also released a blog about getting started with DGA using the CLI and Kibana, which also includes a case study of the process applied to the 2020 [SolarWinds supply chain attack](https://www.elastic.co/blog/elastic-security-provides-free-and-open-protections-for-sunburst):
* [Combining supervised and unsupervised machine learning for DGA detection](https://www.elastic.co/blog/supervised-and-unsupervised-machine-learning-for-dga-detection)
For questions, please reach out to the ML team in the #machine-learning channel of the
[Elastic community Slack workspace](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack)
The team can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning)
*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)*
*Note: the ML features are considered experimental in Kibana as well as this rules CLI*
## Detailed steps
#### 1. Upload and setup the model file and dependencies
Run `python -m detection_rules es <args_or_config> experimental ml setup -t <release-tag>`
*If updating a new model, you should first uninstall any existing models using `remove-model`*
You can also upload files locally using the `-d` option, so long as the naming convention of the files match the
expected pattern for the filenames.
#### 2. Update packetbeat configuration
You will need to update your packetbeat.yml config file to point to the enrichment pipeline
Under `Elasticsearch Output` add the following:
```yaml
output.elasticsearch:
hosts: ["your-hostname:your-port"]
pipeline: dns_enrich_pipeline
```
#### 3. Refresh your packetbeat index
You can optionally choose to refresh your packetbeat index mapping from within Kibana:
* Navigate to `Stack Management > (Kibana) Index Patterns`
* Select the appropriate packetbeat index
* Click `refresh field list`
#### 4. Verify enrichment fields
Any packetbeat documents with the field `dns.question.registered_domain` should now be enriched with `ml_is_dga.*`
@@ -0,0 +1,28 @@
# Experimental ML Jobs and Rules
The ingest pipeline enriches process events by adding additional fields, which are used to power several rules.
The experimental rules and jobs are staged separately from the model bundle under [releases](https://github.com/elastic/detection-rules/releases), with the tag `ML-experimental-detections-YYYMMDD-N`. New releases with this tag may contain either updates to existing rules or new experimental detcetions.
Note that if a rule is of `type = "machine_learning"`, then it may be dependent on uploading and running a machine
learning job first. If this is the case, it will likely be annotated within the `note` field of the rule.
### Uploading rules
Unzip the release bundle and upload these rules individually.
Rules are now stored in ndjson format and can be imported into Kibana via the security app detections page.
Earlier releases stored the rules in toml format. These can be uploaded using the
[7.12 branch](https://github.com/elastic/detection-rules/tree/7.12) CLI using the
[kibana upload-rule](../../CLI.md#uploading-rules-to-kibana) command
### Uploading ML Jobs and Datafeeds
Unzip the release bundle and then run `python -m detection_rules es <args> experimental ml upload-job <ml_job.json>`
To delete a job/datafeed, run `python -m detection_rules es <args> experimental ml delete-job <job-name> <job-type>`
The CLI automatically identifies whether the provided input file is an ML job or datafeed.
Take note of any errors as the jobs and datafeeds may have dependencies on each other which may require stopping and/or removing
referenced jobs/datafeeds first.
@@ -0,0 +1,64 @@
# ProblemChild in the Elastic Stack
ProblemChild helps detect anomalous activity in Windows process events by:
1) Classifying events as malicious vs benign
2) Identifying anomalous events based on rare parent-child process relationships
An end-to-end blog on how to build the ProblemChild framework from scratch for your environment can be found [here](https://www.elastic.co/blog/problemchild-detecting-living-off-the-land-attacks).
You can also find some supplementary material for the blog and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/ProblemChild)
We also released a blog about getting started with ProblemChild using the CLI and Kibana:
* [ProblemChild Release Blog](link to blog)
*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)*
*Note: the ML features are considered experimental in Kibana as well as this rules CLI*
## Detailed steps
#### 1. Upload and setup the model file and dependencies
Run `python -m detection_rules es <args_or_config> experimental ml setup -t <release-tag>`
*If updating a new model, you should first uninstall any existing models using `remove-model`*
You can also upload files locally using the `-d` option, so long as the naming convention of the files match the
expected pattern for the filenames.
#### 2. Update index pipeline configuration
You will need to update your index (containing Windows process event data) settings to point to the ProblemChild enrichment pipeline.
You can do this by running the following command in your Dev Tools console:
```
PUT your-index-pattern/_settings
{
"index": {
"default_pipeline": "ML_ProblemChild_ingest_pipeline"
}
}
```
If you wish to stop enriching your documents using ProblemChild, run the following command in your dev Tools console:
```
PUT your-index-pattern/_settings
{
"index": {
"default_pipeline": null
}
}
```
#### 3. Refresh your indexes
You can optionally choose to refresh your index mapping from within Kibana:
* Navigate to `Stack Management > (Kibana) Index Patterns`
* Select the appropriate indexes
* Click `refresh field list`
#### 4. Verify enrichment fields
Any documents corresponding to Windows process events should now be enriched with `problemchild.*`
@@ -0,0 +1,102 @@
# Experimental machine learning
This repo contains some additional information and files to use experimental[*](#what-does-experimental-mean-in-this-context) machine learning features and detections
## Features
* [DGA](DGA.md)
* [ProblemChild](problem_child.md)
* [experimental detections](experimental-detections.md)
## Releases
There are separate [releases](https://github.com/elastic/detection-rules/releases) for:
* DGA: `ML-DGA-*`
* problem child: `ML-ProblemChild-*`
* experimental detections: `ML-experimental-detections-*`
Releases will use the tag `ML-TYPE-YYYMMDD-N`, which will be needed for uploading the model using the CLI.
## CLI
Support commands can be found under `python -m detection_rules es <es args> experimental ml -h`
```console
Elasticsearch client:
Options:
-et, --timeout INTEGER Timeout for elasticsearch client
-ep, --es-password TEXT
-eu, --es-user TEXT
--elasticsearch-url TEXT
--cloud-id TEXT
* experimental commands are use at your own risk and may change without warning *
Usage: detection_rules es experimental ml [OPTIONS] COMMAND [ARGS]...
Experimental machine learning commands.
Options:
-h, --help Show this message and exit.
Commands:
check-files Check ML model files on an elasticsearch...
delete-job Remove experimental ML jobs.
remove-model Remove ML model files.
remove-scripts-pipelines Remove ML scripts and pipeline files.
setup Upload ML model and dependencies to enrich data.
upload-job Upload experimental ML jobs.
```
## Managing a model and dependencies using the CLI
### Installing
```console
python -m detection_rules es experimental ml setup -h
Elasticsearch client:
Options:
-et, --timeout INTEGER Timeout for elasticsearch client
-ep, --es-password TEXT
-eu, --es-user TEXT
--cloud-id TEXT
--elasticsearch-url TEXT
* experimental commands are use at your own risk and may change without warning *
Usage: detection_rules es experimental ml setup [OPTIONS]
Upload ML model and dependencies to enrich data.
Options:
-t, --model-tag TEXT Release tag for model files staged in detection-
rules (required to download files)
-r, --repo TEXT GitHub repository hosting the model file releases
(owner/repo)
-d, --model-dir DIRECTORY Directory containing local model files
--overwrite Overwrite all files if already in the stack
-h, --help Show this message and exit.
```
### Removing
To remove the ML bundle, you will need to remove the pipelines and scripts first and then the model.
You can do this by running:
* `python -m detection_rules es experimental ml remove-pipeline-scripts --dga --problemchild`
* `python -m detection_rules es experimental ml remove-model <model-id>`
----
##### What does experimental mean in this context?
Experimental model bundles (models, scripts, and pipelines), rules, and jobs are components which are currently in
development and so may not have completed the testing or scrutiny which full production detections are subjected to.
It may also make use of features which are not yet GA and so may be subject to change and are not covered by the support
SLA of general release (GA) features. Some of these features may also never make it to GA.