Refactor experimental ML CLI and code (#1218)
* move github and ml to their own files * refactor release and ml commands * update ML readmes * add unzip_to_dict function * prompt for model ID in remove-model * update experimental rule upload process * update remove-scripts-pipelines to take multiple options Co-authored-by: Ross Wolf <31489089+rw-access@users.noreply.github.com> Co-authored-by: Apoorva <appujo@gmail.com>
This commit is contained in:
@@ -107,9 +107,11 @@ ENV/
|
||||
/_extras/
|
||||
|
||||
# detection rules
|
||||
.detection-rules-cfg.json
|
||||
releases/
|
||||
collections/
|
||||
enriched-rule-indexes/
|
||||
exports/
|
||||
ML-models/
|
||||
surveys/
|
||||
machine-learning/
|
||||
|
||||
@@ -12,9 +12,11 @@ from . import ( # noqa: E402
|
||||
devtools,
|
||||
docs,
|
||||
eswrap,
|
||||
ghwrap,
|
||||
kbwrap,
|
||||
main,
|
||||
mappings,
|
||||
ml,
|
||||
misc,
|
||||
rule_formatter,
|
||||
rule_loader,
|
||||
@@ -26,10 +28,12 @@ __all__ = (
|
||||
'devtools',
|
||||
'docs',
|
||||
'eswrap',
|
||||
'ghwrap',
|
||||
'kbwrap',
|
||||
'mappings',
|
||||
"main",
|
||||
'misc',
|
||||
'ml',
|
||||
'rule_formatter',
|
||||
'rule_loader',
|
||||
'schemas',
|
||||
|
||||
@@ -67,14 +67,13 @@ def multi_collection(f):
|
||||
@click.option('--rule-id', '-id', multiple=True, required=False)
|
||||
@functools.wraps(f)
|
||||
def get_collection(*args, **kwargs):
|
||||
rule_name: List[str] = kwargs.pop("rule_name", [])
|
||||
rule_id: List[str] = kwargs.pop("rule_id", [])
|
||||
rule_files: List[str] = kwargs.pop("rule_file")
|
||||
directories: List[str] = kwargs.pop("directory")
|
||||
|
||||
rules = RuleCollection()
|
||||
|
||||
if not (rule_name or rule_id or rule_files):
|
||||
if not (directories or rule_id or rule_files):
|
||||
client_error('Required: at least one of --rule-id, --rule-file, or --directory')
|
||||
|
||||
rules.load_files(Path(p) for p in rule_files)
|
||||
|
||||
+11
-236
@@ -6,7 +6,6 @@
|
||||
"""CLI commands for internal detection_rules dev team."""
|
||||
import dataclasses
|
||||
import functools
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
@@ -25,10 +24,11 @@ from kibana.connector import Kibana
|
||||
from . import rule_loader
|
||||
from .cli_utils import single_collection
|
||||
from .eswrap import CollectEvents, add_range_to_dsl
|
||||
from .ghwrap import GithubClient
|
||||
from .main import root
|
||||
from .misc import GithubClient, Manifest, PYTHON_LICENSE, add_client, client_error, getdefault
|
||||
from .misc import PYTHON_LICENSE, add_client, client_error
|
||||
from .packaging import PACKAGE_FILE, Package, RELEASE_DIR, current_stack_version, manage_versions
|
||||
from .rule import QueryRuleData, TOMLRule
|
||||
from .rule import AnyRuleData, BaseRuleData, QueryRuleData, TOMLRule
|
||||
from .rule_loader import RuleCollection, production_filter
|
||||
from .utils import dict_hash, get_path, load_dump
|
||||
|
||||
@@ -417,6 +417,14 @@ def deprecate_rule(ctx: click.Context, rule_file: str):
|
||||
click.echo(f'Rule moved to {deprecated_path} - remember to git add this file')
|
||||
|
||||
|
||||
@dev_group.command("update-schemas")
|
||||
def update_schemas():
|
||||
classes = [BaseRuleData] + list(typing.get_args(AnyRuleData))
|
||||
|
||||
for cls in classes:
|
||||
cls.save_schema()
|
||||
|
||||
|
||||
@dev_group.group('test')
|
||||
def test_group():
|
||||
"""Commands for testing against stack resources."""
|
||||
@@ -557,236 +565,3 @@ def rule_survey(ctx: click.Context, query, date_range, dump_file, hide_zero_coun
|
||||
json.dump(details, f, indent=2, sort_keys=True)
|
||||
|
||||
return survey_results
|
||||
|
||||
|
||||
@dev_group.group('gh-release')
|
||||
def gh_release_group():
|
||||
"""Commands to manage GitHub releases."""
|
||||
|
||||
|
||||
@gh_release_group.command('create-ml')
|
||||
@click.argument('directory', type=click.Path(dir_okay=True, file_okay=False))
|
||||
@click.option('--gh-token', '-t', default=getdefault('gh_token'))
|
||||
@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo')
|
||||
@click.option('--release-name', '-n', required=True, help='Name of release')
|
||||
@click.option('--description', '-d', help='Description of release to append to default message')
|
||||
@click.pass_context
|
||||
def create_ml_release(ctx, directory, gh_token, repo, release_name, description):
|
||||
"""Create a GitHub release."""
|
||||
import re
|
||||
|
||||
# ML-DGA-20201129-25
|
||||
pattern = r'^(ML-DGA|ML-experimental-detections)-\d{4}\d{2}\d{2}-\d+$'
|
||||
assert re.match(pattern, release_name), f'release name must match pattern: {pattern}'
|
||||
assert Path(directory).name == release_name, f'directory name must match release name: {release_name}'
|
||||
|
||||
gh_token = gh_token or click.prompt('GitHub token', hide_input=True)
|
||||
client = GithubClient(gh_token)
|
||||
gh_repo = client.authenticated_client.get_repo(repo)
|
||||
|
||||
# validate tag name is increment by 1
|
||||
name_prefix, _, version = release_name.rsplit('-', 2)
|
||||
version = int(version)
|
||||
releases = gh_repo.get_releases()
|
||||
max_ver = max([int(r.raw_data['name'].split('-')[-1]) for r in releases
|
||||
if r.raw_data['name'].startswith(name_prefix)], default=0)
|
||||
|
||||
if version != (max_ver + 1):
|
||||
client_error(f'Last release version was {max_ver}. Release name should end with version: {max_ver + 1}')
|
||||
|
||||
# validate files
|
||||
if name_prefix == 'ML-DGA':
|
||||
zipped_bundle, description_str = ctx.invoke(validate_ml_dga_asset, directory=directory, repo=repo)
|
||||
else:
|
||||
zipped_bundle, description_str = ctx.invoke(validate_ml_detections_asset, directory=directory)
|
||||
|
||||
click.confirm('Validation passed, verify output. Continue?')
|
||||
|
||||
if description:
|
||||
description_str = f'{description_str}\n\n----\n\n{description}'
|
||||
|
||||
release = gh_repo.create_git_release(name=release_name, tag=release_name, message=description_str)
|
||||
zip_name = Path(zipped_bundle).name
|
||||
|
||||
click.echo(f'release created at: {release.html_url}')
|
||||
|
||||
# add zipped bundle as an asset to the release
|
||||
click.echo(f'Uploading zip file: {zip_name}')
|
||||
release.upload_asset(zipped_bundle, label=zip_name, name=zip_name, content_type='application/zip')
|
||||
|
||||
# create manifest entry
|
||||
click.echo('creating manifest for release')
|
||||
manifest = Manifest(repo, tag_name=release_name, token=gh_token)
|
||||
manifest.save()
|
||||
|
||||
return release
|
||||
|
||||
|
||||
@gh_release_group.command('validate-ml-dga-asset')
|
||||
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
|
||||
@click.option('--repo', '-r', default='elastic/detection-rules', help='GitHub owner/repo')
|
||||
def validate_ml_dga_asset(directory, repo):
|
||||
""""Validate and prep an ML DGA bundle for release."""
|
||||
from .eswrap import expected_ml_dga_patterns
|
||||
|
||||
now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
|
||||
|
||||
files = list(Path(directory).glob('*'))
|
||||
if len(files) > 5:
|
||||
client_error('Too many files, expected 5')
|
||||
|
||||
click.secho('[*] validated expected number of files', fg='green')
|
||||
|
||||
# backup files - will re-save sorted to have deterministic hash
|
||||
backup_path = Path(directory).resolve().parent.joinpath(f'backups-{Path(directory).name}-{now.replace(":", "-")}')
|
||||
shutil.copytree(directory, backup_path)
|
||||
|
||||
# validate file names and json and load
|
||||
loaded_contents = {}
|
||||
for name, pattern in expected_ml_dga_patterns.items():
|
||||
path = list(Path(directory).glob(pattern))
|
||||
match_count = len(path)
|
||||
if match_count != 1:
|
||||
client_error(f'Expected filename pattern "{pattern}" for "{name}": {match_count} matches detected')
|
||||
|
||||
file_path = path[0]
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
contents = json.dumps(json.load(f), sort_keys=True)
|
||||
loaded_contents[name] = {'contents': contents, 'filename': file_path}
|
||||
|
||||
sha256 = hashlib.sha256(contents.encode('utf-8')).hexdigest()
|
||||
click.secho(f' - sha256: {sha256} - {name}')
|
||||
|
||||
# re-save sorted
|
||||
with open(file_path, 'w') as f:
|
||||
f.write(contents)
|
||||
except json.JSONDecodeError as e:
|
||||
client_error(f'Invalid JSON in {file_path} file', e)
|
||||
|
||||
model_filename = Path(loaded_contents['model']['filename']).name
|
||||
model_name, _ = model_filename.rsplit('_', maxsplit=1)
|
||||
|
||||
click.secho('[*] re-saved all files with keys sorted for deterministic hashing', fg='green')
|
||||
click.secho(f' [+] backups saved to: {backup_path}')
|
||||
click.secho('[*] validated expected naming patterns for all files', fg='green')
|
||||
click.secho('[*] validated json formatting of all files', fg='green')
|
||||
|
||||
# check manifest for existing things
|
||||
existing_sha = False
|
||||
existing_model_name = False
|
||||
model_hash = hashlib.sha256(loaded_contents['model']['contents'].encode('utf-8')).hexdigest()
|
||||
manifest_hashes = Manifest.get_existing_asset_hashes(repo)
|
||||
|
||||
for release, file_data in manifest_hashes.items():
|
||||
for file_name, sha in file_data.items():
|
||||
if model_hash == sha:
|
||||
existing_sha = True
|
||||
click.secho(f'[!] hash for model file: "{loaded_contents["model"]["filename"]}" matches: '
|
||||
f'{release} -> {file_name} -> {sha}', fg='yellow')
|
||||
|
||||
if model_filename == file_name:
|
||||
existing_model_name = True
|
||||
client_error(f'name for model file: "{loaded_contents["model"]["filename"]}" matches: '
|
||||
f'{release} -> {file_name} -> {file_name}')
|
||||
|
||||
if not existing_sha:
|
||||
click.secho(f'[+] validated no existing models matched hashes for: '
|
||||
f'{loaded_contents["model"]["filename"]}', fg='green')
|
||||
|
||||
if not existing_model_name:
|
||||
click.secho(f'[+] validated no existing models matched names for: '
|
||||
f'{loaded_contents["model"]["filename"]}', fg='green')
|
||||
|
||||
# save zip
|
||||
zip_name_no_ext = Path(directory).resolve()
|
||||
zip_name = f'{zip_name_no_ext}.zip'
|
||||
shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name)
|
||||
click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green')
|
||||
|
||||
click.secho(f'[!] run `setup-dga-model -d {directory}` to test this on a live stack before releasing', fg='yellow')
|
||||
|
||||
description = {
|
||||
'model_name': model_name + '\n\n----\n\n',
|
||||
'date': now,
|
||||
'model_sha256': model_hash,
|
||||
'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md'
|
||||
}
|
||||
description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()])
|
||||
click.echo()
|
||||
click.echo(f'[*] description to paste with release:\n\n{description_str}\n')
|
||||
|
||||
return zip_name, description_str
|
||||
|
||||
|
||||
@gh_release_group.command('validate-ml-detections-asset')
|
||||
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
|
||||
def validate_ml_detections_asset(directory):
|
||||
"""Validate and prep ML detection rules and jobs before release."""
|
||||
import pytoml
|
||||
|
||||
now = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
|
||||
|
||||
all_files = list(Path(directory).glob('*'))
|
||||
job_paths = [f for f in all_files if f.suffix == '.json']
|
||||
rule_paths = [f for f in all_files if f.suffix == '.toml']
|
||||
other_paths = [f for f in Path(directory).glob('*') if f.suffix not in ('.toml', '.json')]
|
||||
job_count = len(job_paths)
|
||||
rule_count = len(rule_paths)
|
||||
other_count = len(other_paths)
|
||||
|
||||
if 'readme.md' not in [f.name.lower() for f in other_paths]:
|
||||
client_error('Release is missing readme file')
|
||||
|
||||
for job in job_paths:
|
||||
try:
|
||||
with open(job, 'r') as f:
|
||||
j = json.load(f)
|
||||
assert j.get('name'), click.style(f'[!] job file "{job}" missing: name', fg='red')
|
||||
assert j.get('type'), click.style(f'[!] job file "{job}" missing: type', fg='red')
|
||||
assert j.get('body'), click.style(f'[!] job file "{job}" missing: body', fg='red')
|
||||
except json.JSONDecodeError as e:
|
||||
client_error(f'Invalid JSON in {job} file', e)
|
||||
|
||||
click.secho(f'[*] validated json formatting and required fields in {job_count} job files', fg='green')
|
||||
|
||||
for rule in rule_paths:
|
||||
with open(rule, 'r') as f:
|
||||
try:
|
||||
pytoml.load(f)
|
||||
except pytoml.TomlError as e:
|
||||
client_error(f'[!] invalid rule toml for: {rule}', e)
|
||||
|
||||
click.secho(f'[*] validated toml formatting for {rule_count} rule files', fg='green')
|
||||
|
||||
# save zip
|
||||
zip_name_no_ext = Path(directory).resolve()
|
||||
zip_name = f'{zip_name_no_ext}.zip'
|
||||
shutil.make_archive(str(zip_name_no_ext), 'zip', root_dir=zip_name_no_ext.parent, base_dir=zip_name_no_ext.name)
|
||||
click.secho(f'[+] zipped folder saved to {zip_name} for release', fg='green')
|
||||
|
||||
click.secho('[!] run `kibana upload-rule` to test rules on a live stack before releasing', fg='green')
|
||||
click.secho('[!] run `es upload-ml-job` to test jobs on a live stack before releasing', fg='green')
|
||||
|
||||
description = {
|
||||
'Experimental rules': rule_count,
|
||||
'Experimental ML jobs': job_count,
|
||||
'Other files': str(other_count) + '\n\n----\n\n',
|
||||
'DGA release': '<add link to DGA release these detections were built on>',
|
||||
'date': now,
|
||||
'For details reference': 'https://github.com/elastic/detection-rules/blob/main/docs/ML_DGA.md'
|
||||
}
|
||||
description_str = '\n'.join([f'{k}: {v}' for k, v in description.items()])
|
||||
click.echo()
|
||||
click.echo(f'description to paste with release:\n\n{description_str}\n')
|
||||
|
||||
return zip_name, description_str
|
||||
|
||||
|
||||
@dev_group.command("update-schemas")
|
||||
def update_schemas():
|
||||
from .rule import BaseRuleData, AnyRuleData
|
||||
classes = [BaseRuleData] + list(typing.get_args(AnyRuleData))
|
||||
|
||||
for cls in classes:
|
||||
cls.save_schema()
|
||||
|
||||
+1
-317
@@ -8,14 +8,12 @@ import json
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import click
|
||||
import elasticsearch
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.client import AsyncSearchClient, IngestClient, LicenseClient, MlClient
|
||||
from elasticsearch.client import AsyncSearchClient
|
||||
|
||||
import kql
|
||||
from .main import root
|
||||
@@ -422,317 +420,3 @@ def index_repo(ctx: click.Context, query, from_file, save_files):
|
||||
def es_experimental():
|
||||
"""[Experimental] helper commands for integrating with Elasticsearch."""
|
||||
click.secho('\n* experimental commands are use at your own risk and may change without warning *\n')
|
||||
|
||||
|
||||
@es_experimental.command('check-model-files')
|
||||
@click.pass_context
|
||||
def check_model_files(ctx):
|
||||
"""Check ML model files on an elasticsearch instance."""
|
||||
from elasticsearch.client import IngestClient, MlClient
|
||||
from .misc import get_ml_model_manifests_by_model_id
|
||||
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
ml_client = MlClient(es_client)
|
||||
ingest_client = IngestClient(es_client)
|
||||
|
||||
def safe_get(func, arg):
|
||||
try:
|
||||
return func(arg)
|
||||
except elasticsearch.NotFoundError:
|
||||
return None
|
||||
|
||||
models = [m for m in ml_client.get_trained_models().get('trained_model_configs', [])
|
||||
if m['created_by'] != '_xpack']
|
||||
|
||||
if models:
|
||||
if len([m for m in models if m['model_id'].startswith('dga_')]) > 1:
|
||||
click.secho('Multiple DGA models detected! It is not recommended to run more than one DGA model at a time',
|
||||
fg='yellow')
|
||||
|
||||
manifests = get_ml_model_manifests_by_model_id()
|
||||
|
||||
click.echo(f'DGA Model{"s" if len(models) > 1 else ""} found:')
|
||||
for model in models:
|
||||
manifest = manifests.get(model['model_id'])
|
||||
click.echo(f' - {model["model_id"]}, associated release: {manifest.html_url if manifest else None}')
|
||||
else:
|
||||
click.echo('No DGA Models found')
|
||||
|
||||
support_files = {
|
||||
'create_script': safe_get(es_client.get_script, 'dga_ngrams_create'),
|
||||
'delete_script': safe_get(es_client.get_script, 'dga_ngrams_transform_delete'),
|
||||
'enrich_pipeline': safe_get(ingest_client.get_pipeline, 'dns_enrich_pipeline'),
|
||||
'inference_pipeline': safe_get(ingest_client.get_pipeline, 'dns_dga_inference_enrich_pipeline')
|
||||
}
|
||||
|
||||
click.echo('Support Files:')
|
||||
for support_file, results in support_files.items():
|
||||
click.echo(f' - {support_file}: {"found" if results else "not found"}')
|
||||
|
||||
|
||||
@es_experimental.command('remove-dga-model')
|
||||
@click.argument('model-id')
|
||||
@click.option('--force', '-f', is_flag=True, help='Force the attempted delete without checking if model exists')
|
||||
@click.pass_context
|
||||
def remove_dga_model(ctx, model_id, force, es_client: Elasticsearch = None, ml_client: MlClient = None,
|
||||
ingest_client: IngestClient = None):
|
||||
"""Remove ML DGA files."""
|
||||
from elasticsearch.client import IngestClient, MlClient
|
||||
|
||||
es_client = es_client or ctx.obj['es']
|
||||
ml_client = ml_client or MlClient(es_client)
|
||||
ingest_client = ingest_client or IngestClient(es_client)
|
||||
|
||||
def safe_delete(func, fid, verbose=True):
|
||||
try:
|
||||
func(fid)
|
||||
except elasticsearch.NotFoundError:
|
||||
return False
|
||||
if verbose:
|
||||
click.echo(f' - {fid} deleted')
|
||||
return True
|
||||
|
||||
model_exists = False
|
||||
if not force:
|
||||
existing_models = ml_client.get_trained_models()
|
||||
model_exists = model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]
|
||||
|
||||
if model_exists or force:
|
||||
if model_exists:
|
||||
click.secho('[-] Existing model detected - deleting files', fg='yellow')
|
||||
|
||||
deleted = [
|
||||
safe_delete(ingest_client.delete_pipeline, 'dns_dga_inference_enrich_pipeline'),
|
||||
safe_delete(ingest_client.delete_pipeline, 'dns_enrich_pipeline'),
|
||||
safe_delete(es_client.delete_script, 'dga_ngrams_transform_delete'),
|
||||
# f'{model_id}_dga_ngrams_transform_delete'
|
||||
safe_delete(es_client.delete_script, 'dga_ngrams_create'),
|
||||
# f'{model_id}_dga_ngrams_create'
|
||||
safe_delete(ml_client.delete_trained_model, model_id)
|
||||
]
|
||||
|
||||
if not any(deleted):
|
||||
click.echo('No files deleted')
|
||||
else:
|
||||
click.echo(f'Model: {model_id} not found')
|
||||
|
||||
|
||||
expected_ml_dga_patterns = {
|
||||
'model': 'dga_*_model.json', # noqa: E241
|
||||
'dga_ngrams_create': 'dga_*_ngrams_create.json', # noqa: E241
|
||||
'dga_ngrams_transform_delete': 'dga_*_ngrams_transform_delete.json', # noqa: E241
|
||||
'dns_enrich_pipeline': 'dga_*_ingest_pipeline1.json', # noqa: E241
|
||||
'dns_dga_inference_enrich_pipeline': 'dga_*_ingest_pipeline2.json' # noqa: E241
|
||||
}
|
||||
|
||||
|
||||
@es_experimental.command('setup-dga-model')
|
||||
@click.option('--model-tag', '-t',
|
||||
help='Release tag for model files staged in detection-rules (required to download files)')
|
||||
@click.option('--repo', '-r', default='elastic/detection-rules',
|
||||
help='GitHub repository hosting the model file releases (owner/repo)')
|
||||
@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False),
|
||||
help='Directory containing local model files')
|
||||
@click.option('--overwrite', is_flag=True, help='Overwrite all files if already in the stack')
|
||||
@click.pass_context
|
||||
def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite):
|
||||
"""Upload ML DGA model and dependencies and enrich DNS data."""
|
||||
import io
|
||||
import requests
|
||||
import shutil
|
||||
import zipfile
|
||||
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
client_info = es_client.info()
|
||||
license_client = LicenseClient(es_client)
|
||||
|
||||
if license_client.get()['license']['type'].lower() not in ('platinum', 'enterprise'):
|
||||
client_error('You must have a platinum or enterprise subscription in order to use these ML features')
|
||||
|
||||
# download files if necessary
|
||||
if not model_dir:
|
||||
if not model_tag:
|
||||
client_error('model-tag or model-dir required to download model files')
|
||||
|
||||
click.echo(f'Downloading artifact: {model_tag}')
|
||||
|
||||
release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}'
|
||||
release = requests.get(release_url)
|
||||
release.raise_for_status()
|
||||
assets = [a for a in release.json()['assets'] if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')]
|
||||
|
||||
if len(assets) != 1:
|
||||
client_error(f'Malformed release: expected 1 match ML-DGA zip, found: {len(assets)}!')
|
||||
|
||||
zipped_url = assets[0]['browser_download_url']
|
||||
zipped = requests.get(zipped_url)
|
||||
z = zipfile.ZipFile(io.BytesIO(zipped.content))
|
||||
|
||||
dga_dir = get_path('ML-models', 'DGA')
|
||||
model_dir = os.path.join(dga_dir, model_tag)
|
||||
os.makedirs(dga_dir, exist_ok=True)
|
||||
shutil.rmtree(model_dir, ignore_errors=True)
|
||||
z.extractall(dga_dir)
|
||||
click.echo(f'files saved to {model_dir}')
|
||||
|
||||
# read files as needed
|
||||
z.close()
|
||||
|
||||
def get_model_filename(pattern):
|
||||
paths = list(Path(model_dir).glob(pattern))
|
||||
if not paths:
|
||||
client_error(f'{model_dir} missing files matching the pattern: {pattern}')
|
||||
if len(paths) > 1:
|
||||
client_error(f'{model_dir} contains multiple files matching the pattern: {pattern}')
|
||||
|
||||
return paths[0]
|
||||
|
||||
@contextmanager
|
||||
def open_model_file(name):
|
||||
pattern = expected_ml_dga_patterns[name]
|
||||
with open(get_model_filename(pattern), 'r') as f:
|
||||
yield json.load(f)
|
||||
|
||||
model_id, _ = os.path.basename(get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1)
|
||||
|
||||
click.echo(f'Setting up DGA model: "{model_id}" on {client_info["name"]} ({client_info["version"]["number"]})')
|
||||
|
||||
# upload model
|
||||
ml_client = MlClient(es_client)
|
||||
ingest_client = IngestClient(es_client)
|
||||
|
||||
existing_models = ml_client.get_trained_models()
|
||||
if model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]:
|
||||
if overwrite:
|
||||
ctx.invoke(remove_dga_model, model_id=model_id, es_client=es_client, ml_client=ml_client,
|
||||
ingest_client=ingest_client, force=True)
|
||||
else:
|
||||
client_error(f'Model: {model_id} already exists on stack! Try --overwrite to force the upload')
|
||||
|
||||
click.secho('[+] Uploading model (may take a while)')
|
||||
|
||||
with open_model_file('model') as model_file:
|
||||
try:
|
||||
ml_client.put_trained_model(model_id=model_id, body=model_file)
|
||||
except elasticsearch.ConnectionTimeout:
|
||||
msg = 'Connection timeout, try increasing timeout using `es --timeout <secs> experimental setup_dga_model`.'
|
||||
client_error(msg)
|
||||
|
||||
# install scripts
|
||||
click.secho('[+] Uploading painless scripts')
|
||||
|
||||
with open_model_file('dga_ngrams_create') as painless_install:
|
||||
es_client.put_script(id='dga_ngrams_create', body=painless_install)
|
||||
# f'{model_id}_dga_ngrams_create'
|
||||
|
||||
with open_model_file('dga_ngrams_transform_delete') as painless_delete:
|
||||
es_client.put_script(id='dga_ngrams_transform_delete', body=painless_delete)
|
||||
# f'{model_id}_dga_ngrams_transform_delete'
|
||||
|
||||
# Install ingest pipelines
|
||||
click.secho('[+] Uploading pipelines')
|
||||
|
||||
def _build_es_script_error(err, pipeline_file):
|
||||
error = err.info['error']
|
||||
cause = error['caused_by']
|
||||
|
||||
error_msg = [
|
||||
f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
|
||||
' '.join(f'{k}: {v}' for k, v in error['position'].items()),
|
||||
'\n'.join(error['script_stack'])
|
||||
]
|
||||
|
||||
return click.style('\n'.join(error_msg), fg='red')
|
||||
|
||||
with open_model_file('dns_enrich_pipeline') as ingest_pipeline1:
|
||||
try:
|
||||
ingest_client.put_pipeline(id='dns_enrich_pipeline', body=ingest_pipeline1)
|
||||
except elasticsearch.RequestError as e:
|
||||
if e.error == 'script_exception':
|
||||
client_error(_build_es_script_error(e, 'ingest_pipeline1'), e, ctx=ctx)
|
||||
else:
|
||||
raise
|
||||
|
||||
with open_model_file('dns_dga_inference_enrich_pipeline') as ingest_pipeline2:
|
||||
try:
|
||||
ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline', body=ingest_pipeline2)
|
||||
except elasticsearch.RequestError as e:
|
||||
if e.error == 'script_exception':
|
||||
client_error(_build_es_script_error(e, 'ingest_pipeline2'), e, ctx=ctx)
|
||||
else:
|
||||
raise
|
||||
|
||||
click.echo('Ensure that you have updated your packetbeat.yml config file.')
|
||||
click.echo(' - reference: ML_DGA.md #2-update-packetbeat-configuration')
|
||||
click.echo('Associated rules and jobs can be found under ML-experimental-detections releases in the repo')
|
||||
click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
|
||||
click.echo('To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
|
||||
|
||||
|
||||
@es_experimental.command('upload-ml-job')
|
||||
@click.argument('job-file', type=click.Path(exists=True, dir_okay=False))
|
||||
@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name')
|
||||
@click.pass_context
|
||||
def upload_ml_job(ctx: click.Context, job_file, overwrite):
|
||||
"""Upload experimental ML jobs."""
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
ml_client = MlClient(es_client)
|
||||
|
||||
with open(job_file, 'r') as f:
|
||||
job = json.load(f)
|
||||
|
||||
def safe_upload(func):
|
||||
try:
|
||||
func(name, body)
|
||||
except (elasticsearch.ConflictError, elasticsearch.RequestError) as err:
|
||||
if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception':
|
||||
client_error(str(err), err, ctx=ctx)
|
||||
|
||||
if overwrite:
|
||||
ctx.invoke(delete_ml_job, job_name=name, job_type=job_type)
|
||||
func(name, body)
|
||||
else:
|
||||
client_error(str(err), err, ctx=ctx)
|
||||
|
||||
try:
|
||||
job_type = job['type']
|
||||
name = job['name']
|
||||
body = job['body']
|
||||
|
||||
if job_type == 'anomaly_detection':
|
||||
safe_upload(ml_client.put_job)
|
||||
elif job_type == 'data_frame_analytic':
|
||||
safe_upload(ml_client.put_data_frame_analytics)
|
||||
elif job_type == 'datafeed':
|
||||
safe_upload(ml_client.put_datafeed)
|
||||
else:
|
||||
client_error(f'Unknown ML job type: {job_type}')
|
||||
|
||||
click.echo(f'Uploaded {job_type} job: {name}')
|
||||
except KeyError as e:
|
||||
client_error(f'{job_file} missing required info: {e}')
|
||||
|
||||
|
||||
@es_experimental.command('delete-ml-job')
|
||||
@click.argument('job-name')
|
||||
@click.argument('job-type')
|
||||
@click.pass_context
|
||||
def delete_ml_job(ctx: click.Context, job_name, job_type, verbose=True):
|
||||
"""Remove experimental ML jobs."""
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
ml_client = MlClient(es_client)
|
||||
|
||||
try:
|
||||
if job_type == 'anomaly_detection':
|
||||
ml_client.delete_job(job_name)
|
||||
elif job_type == 'data_frame_analytic':
|
||||
ml_client.delete_data_frame_analytics(job_name)
|
||||
elif job_type == 'datafeed':
|
||||
ml_client.delete_datafeed(job_name)
|
||||
else:
|
||||
client_error(f'Unknown ML job type: {job_type}')
|
||||
except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e:
|
||||
client_error(str(e), e, ctx=ctx)
|
||||
|
||||
if verbose:
|
||||
click.echo(f'Deleted {job_type} job: {job_name}')
|
||||
|
||||
@@ -0,0 +1,295 @@
|
||||
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
# or more contributor license agreements. Licensed under the Elastic License
|
||||
# 2.0; you may not use this file except in compliance with the Elastic License
|
||||
# 2.0.
|
||||
|
||||
"""Schemas and dataclasses for GitHub releases."""
|
||||
|
||||
import dataclasses
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import shutil
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Tuple
|
||||
from zipfile import ZipFile
|
||||
|
||||
import click
|
||||
import requests
|
||||
|
||||
from .schemas import definitions
|
||||
|
||||
# this is primarily for type hinting - all use of the github client should come from GithubClient class
|
||||
try:
|
||||
from github import Github
|
||||
from github.Repository import Repository
|
||||
from github.GitRelease import GitRelease
|
||||
from github.GitReleaseAsset import GitReleaseAsset
|
||||
except ImportError:
|
||||
# for type hinting
|
||||
Github = None # noqa: N806
|
||||
Repository = None # noqa: N806
|
||||
GitRelease = None # noqa: N806
|
||||
GitReleaseAsset = None # noqa: N806
|
||||
|
||||
|
||||
def get_gh_release(repo: Repository, release_name: Optional[str] = None, tag_name: Optional[str] = None) -> GitRelease:
|
||||
"""Get a list of GitHub releases by repo."""
|
||||
assert release_name or tag_name, 'Must specify a release_name or tag_name'
|
||||
|
||||
releases = repo.get_releases()
|
||||
for release in releases:
|
||||
if release_name and release_name == release.title:
|
||||
return release
|
||||
elif tag_name and tag_name == release.tag_name:
|
||||
return release
|
||||
|
||||
|
||||
def load_zipped_gh_assets_with_metadata(url: str) -> Tuple[str, dict]:
|
||||
"""Download and unzip a GitHub assets."""
|
||||
response = requests.get(url)
|
||||
zipped_asset = ZipFile(io.BytesIO(response.content))
|
||||
zipped_sha256 = hashlib.sha256(response.content).hexdigest()
|
||||
|
||||
assets = {}
|
||||
for zipped in zipped_asset.filelist:
|
||||
if zipped.is_dir():
|
||||
continue
|
||||
|
||||
contents = zipped_asset.read(zipped.filename)
|
||||
sha256 = hashlib.sha256(contents).hexdigest()
|
||||
|
||||
assets[zipped.filename] = {
|
||||
'contents': contents,
|
||||
'metadata': {
|
||||
'compress_size': zipped.compress_size,
|
||||
# zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday
|
||||
'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)),
|
||||
'sha256': sha256,
|
||||
'size': zipped.file_size,
|
||||
}
|
||||
}
|
||||
|
||||
return zipped_sha256, assets
|
||||
|
||||
|
||||
def load_json_gh_asset(url: str) -> dict:
|
||||
"""Load and return the contents of a json asset file."""
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def download_gh_asset(url: str, path: str, overwrite=False):
|
||||
"""Download and unzip a GitHub asset."""
|
||||
zipped = requests.get(url)
|
||||
z = ZipFile(io.BytesIO(zipped.content))
|
||||
|
||||
Path(path).mkdir(exist_ok=True)
|
||||
if overwrite:
|
||||
shutil.rmtree(path, ignore_errors=True)
|
||||
|
||||
z.extractall(path)
|
||||
click.echo(f'files saved to {path}')
|
||||
|
||||
z.close()
|
||||
|
||||
|
||||
class GithubClient:
|
||||
"""GitHub client wrapper."""
|
||||
|
||||
def __init__(self, token: Optional[str] = None):
|
||||
"""Get an unauthenticated client, verified authenticated client, or a default client."""
|
||||
if not Github:
|
||||
raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`')
|
||||
|
||||
self.client: Github = Github(token)
|
||||
self.unauthenticated_client = Github()
|
||||
self.__token = token
|
||||
self.__authenticated_client = None
|
||||
|
||||
@property
|
||||
def authenticated_client(self) -> Github:
|
||||
if not self.__token:
|
||||
raise ValueError('Token not defined! Re-instantiate with a token or use add_token method')
|
||||
if not self.__authenticated_client:
|
||||
self.__authenticated_client = Github(self.__token)
|
||||
return self.__authenticated_client
|
||||
|
||||
def add_token(self, token):
|
||||
self.__token = token
|
||||
|
||||
|
||||
@dataclass
|
||||
class AssetManifestEntry:
|
||||
|
||||
compress_size: int
|
||||
created_at: datetime
|
||||
name: str
|
||||
sha256: str
|
||||
size: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class AssetManifestMetadata:
|
||||
|
||||
relative_url: str
|
||||
entries: Dict[str, AssetManifestEntry]
|
||||
zipped_sha256: definitions.Sha256
|
||||
created_at: datetime = field(default_factory=datetime.utcnow)
|
||||
description: Optional[str] = None # populated by GitHub release asset label
|
||||
|
||||
|
||||
@dataclass
|
||||
class ReleaseManifest:
|
||||
|
||||
assets: Dict[str, AssetManifestMetadata]
|
||||
assets_url: str
|
||||
author: str # parsed from GitHub release metadata as: author[login]
|
||||
created_at: str
|
||||
html_url: str
|
||||
id: int
|
||||
name: str
|
||||
published_at: str
|
||||
url: str
|
||||
zipball_url: str
|
||||
tag_name: str = None
|
||||
description: str = None # parsed from GitHub release metadata as: body
|
||||
|
||||
|
||||
class ManifestManager:
|
||||
"""Manifest handler for GitHub releases."""
|
||||
|
||||
def __init__(self, repo: str = 'elastic/detection-rules', release_name: Optional[str] = None,
|
||||
tag_name: Optional[str] = None, token: Optional[str] = None):
|
||||
self.repo_name = repo
|
||||
self.release_name = release_name
|
||||
self.tag_name = tag_name
|
||||
self.gh_client = GithubClient(token)
|
||||
self.has_token = token is not None
|
||||
|
||||
self.repo: Repository = self.gh_client.client.get_repo(repo)
|
||||
self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name)
|
||||
|
||||
if not self.release:
|
||||
raise ValueError(f'No release found for {tag_name or release_name}')
|
||||
|
||||
if not self.release_name:
|
||||
self.release_name = self.release.title
|
||||
|
||||
self.manifest_name = f'manifest-{self.release_name}.json'
|
||||
self.assets: dict = self._get_enriched_assets_from_release()
|
||||
self.release_manifest = self._create()
|
||||
self.__release_manifest_dict = dataclasses.asdict(self.release_manifest)
|
||||
self.manifest_size = len(json.dumps(self.__release_manifest_dict))
|
||||
|
||||
@property
|
||||
def release_manifest_fl(self) -> io.BytesIO:
|
||||
return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8'))
|
||||
|
||||
def _create(self) -> ReleaseManifest:
|
||||
"""Create the manifest from GitHub asset metadata and file contents."""
|
||||
assets = {}
|
||||
for asset_name, asset_data in self.assets.items():
|
||||
entries = {}
|
||||
data = asset_data['data']
|
||||
metadata = asset_data['metadata']
|
||||
|
||||
for file_name, file_data in data.items():
|
||||
file_metadata = file_data['metadata']
|
||||
|
||||
name = Path(file_name).name
|
||||
file_metadata.update(name=name)
|
||||
|
||||
entry = AssetManifestEntry(**file_metadata)
|
||||
entries[name] = entry
|
||||
|
||||
assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries,
|
||||
metadata['zipped_sha256'], metadata['created_at'],
|
||||
metadata['label'])
|
||||
|
||||
release_metadata = self._parse_release_metadata()
|
||||
release_metadata.update(assets=assets)
|
||||
release_manifest = ReleaseManifest(**release_metadata)
|
||||
|
||||
return release_manifest
|
||||
|
||||
def _parse_release_metadata(self) -> dict:
|
||||
"""Parse relevant info from GitHub metadata for release manifest."""
|
||||
ignore = ['assets']
|
||||
manual_set_keys = ['author', 'description']
|
||||
keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys]
|
||||
parsed = {k: self.release.raw_data[k] for k in keys}
|
||||
parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login'])
|
||||
return parsed
|
||||
|
||||
def save(self) -> GitReleaseAsset:
|
||||
"""Save manifest files."""
|
||||
if not self.has_token:
|
||||
raise ValueError('You must provide a token to save a manifest to a GitHub release')
|
||||
|
||||
asset = self.release.upload_asset_from_memory(self.release_manifest_fl,
|
||||
self.manifest_size,
|
||||
self.manifest_name)
|
||||
click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}')
|
||||
return asset
|
||||
|
||||
@classmethod
|
||||
def load(cls, name: str, repo: str = 'elastic/detection-rules', token: Optional[str] = None) -> Optional[dict]:
|
||||
"""Load a manifest."""
|
||||
gh_client = GithubClient(token)
|
||||
repo = gh_client.client.get_repo(repo)
|
||||
release = get_gh_release(repo, tag_name=name)
|
||||
|
||||
for asset in release.get_assets():
|
||||
if asset.name == f'manifest-{name}.json':
|
||||
return load_json_gh_asset(asset.browser_download_url)
|
||||
|
||||
@classmethod
|
||||
def load_all(cls, repo: str = 'elastic/detection-rules', token: Optional[str] = None
|
||||
) -> Tuple[Dict[str, dict], list]:
|
||||
"""Load a consolidated manifest."""
|
||||
gh_client = GithubClient(token)
|
||||
repo = gh_client.client.get_repo(repo)
|
||||
|
||||
consolidated = {}
|
||||
missing = set()
|
||||
for release in repo.get_releases():
|
||||
name = release.tag_name
|
||||
asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None)
|
||||
if not asset:
|
||||
missing.add(name)
|
||||
else:
|
||||
consolidated[name] = load_json_gh_asset(asset.browser_download_url)
|
||||
|
||||
return consolidated, list(missing)
|
||||
|
||||
@classmethod
|
||||
def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token: Optional[str] = None) -> dict:
|
||||
"""Load all assets with their hashes, by release."""
|
||||
flat = {}
|
||||
consolidated, _ = cls.load_all(repo=repo, token=token)
|
||||
for release, data in consolidated.items():
|
||||
for asset in data['assets'].values():
|
||||
flat_release = flat[release] = {}
|
||||
for asset_name, asset_data in asset['entries'].items():
|
||||
flat_release[asset_name] = asset_data['sha256']
|
||||
|
||||
return flat
|
||||
|
||||
def _get_enriched_assets_from_release(self) -> dict:
|
||||
"""Get assets and metadata from a GitHub release."""
|
||||
assets = {}
|
||||
for asset in [a.raw_data for a in self.release.get_assets()]:
|
||||
zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url'])
|
||||
asset.update(zipped_sha256=zipped_sha256)
|
||||
|
||||
assets[asset['name']] = {
|
||||
'metadata': asset,
|
||||
'data': data
|
||||
}
|
||||
|
||||
return assets
|
||||
+1
-298
@@ -4,22 +4,14 @@
|
||||
# 2.0.
|
||||
|
||||
"""Misc support."""
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import time
|
||||
import uuid
|
||||
import dataclasses
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
from pathlib import Path
|
||||
from typing import Dict, NoReturn, Tuple
|
||||
from zipfile import ZipFile
|
||||
from typing import NoReturn
|
||||
|
||||
import click
|
||||
import requests
|
||||
@@ -57,295 +49,6 @@ JS_LICENSE = """
|
||||
""".strip().format("\n".join(' * ' + line for line in LICENSE_LINES))
|
||||
|
||||
|
||||
def get_gh_release(repo: Repository, release_name=None, tag_name=None) -> GitRelease:
|
||||
"""Get a list of GitHub releases by repo."""
|
||||
assert release_name or tag_name, 'Must specify a release_name or tag_name'
|
||||
|
||||
releases = repo.get_releases()
|
||||
release = next((r for r in releases
|
||||
if (release_name and release_name == r.title) or (tag_name and tag_name == r.tag_name)), None)
|
||||
|
||||
return release
|
||||
|
||||
|
||||
def upload_gh_release_asset(token, asset: bytes, asset_name, repo=None, content_type='application/zip',
|
||||
release_name=None, tag_name=None, upload_url=None):
|
||||
"""Save a Github relase asset."""
|
||||
if not upload_url:
|
||||
assert repo, 'You must provide a repo name if not providing an upload_url'
|
||||
|
||||
release = get_gh_release(repo, release_name, tag_name)
|
||||
upload_url = release['upload_url']
|
||||
suffix = '{?name,label}'
|
||||
upload_url = upload_url.replace(suffix, f'?name={asset_name}&label={asset_name}')
|
||||
|
||||
headers = {'content-type': content_type}
|
||||
r = requests.post(upload_url, auth=('', token), data=asset, headers=headers)
|
||||
r.raise_for_status()
|
||||
click.echo(f'Uploaded {asset_name} to release: {r.json()["url"]}')
|
||||
|
||||
|
||||
def load_zipped_gh_assets_with_metadata(url) -> Tuple[str, dict]:
|
||||
"""Download and unzip a GitHub assets."""
|
||||
response = requests.get(url)
|
||||
zipped_asset = ZipFile(io.BytesIO(response.content))
|
||||
zipped_sha256 = hashlib.sha256(response.content).hexdigest()
|
||||
|
||||
assets = {}
|
||||
for zipped in zipped_asset.filelist:
|
||||
if zipped.is_dir():
|
||||
continue
|
||||
|
||||
contents = zipped_asset.read(zipped.filename)
|
||||
sha256 = hashlib.sha256(contents).hexdigest()
|
||||
|
||||
assets[zipped.filename] = {
|
||||
'contents': contents,
|
||||
'metadata': {
|
||||
'compress_size': zipped.compress_size,
|
||||
# zipfile provides only a 6 tuple datetime; -1 means DST is unknown; 0's set tm_wday and tm_yday
|
||||
'created_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', zipped.date_time + (0, 0, -1)),
|
||||
'sha256': sha256,
|
||||
'size': zipped.file_size,
|
||||
}
|
||||
}
|
||||
|
||||
return zipped_sha256, assets
|
||||
|
||||
|
||||
def load_json_gh_asset(url) -> dict:
|
||||
"""Load and return the contents of a json asset file."""
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def download_gh_asset(url, path, overwrite=False):
|
||||
"""Download and unzip a GitHub asset."""
|
||||
zipped = requests.get(url)
|
||||
z = ZipFile(io.BytesIO(zipped.content))
|
||||
|
||||
Path(path).mkdir(exist_ok=True)
|
||||
if overwrite:
|
||||
shutil.rmtree(path, ignore_errors=True)
|
||||
|
||||
z.extractall(path)
|
||||
click.echo(f'files saved to {path}')
|
||||
|
||||
z.close()
|
||||
|
||||
|
||||
class GithubClient:
|
||||
"""GitHub client wrapper."""
|
||||
|
||||
def __init__(self, token=None):
|
||||
"""Get an unauthenticated client, verified authenticated client, or a default client."""
|
||||
if not Github:
|
||||
raise ModuleNotFoundError('Missing PyGithub - try running `pip install -r requirements-dev.txt`')
|
||||
|
||||
self.client: Github = Github(token)
|
||||
self.unauthenticated_client = Github()
|
||||
self.__token = token
|
||||
self.__authenticated_client = None
|
||||
|
||||
@property
|
||||
def authenticated_client(self) -> Github:
|
||||
if not self.__token:
|
||||
raise ValueError('Token not defined! Re-instantiate with a token or use add_token method')
|
||||
if not self.__authenticated_client:
|
||||
self.__authenticated_client = Github(self.__token)
|
||||
return self.__authenticated_client
|
||||
|
||||
def add_token(self, token):
|
||||
self.__token = token
|
||||
|
||||
|
||||
@dataclass
|
||||
class AssetManifestEntry:
|
||||
|
||||
compress_size: int
|
||||
created_at: datetime
|
||||
name: str
|
||||
sha256: str
|
||||
size: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class AssetManifestMetadata:
|
||||
|
||||
relative_url: str
|
||||
entries: Dict[str, AssetManifestEntry]
|
||||
zipped_sha256: str
|
||||
created_at: datetime = field(default_factory=datetime.utcnow)
|
||||
description: str = None # label
|
||||
|
||||
|
||||
@dataclass
|
||||
class ReleaseManifest:
|
||||
|
||||
assets: Dict[str, AssetManifestMetadata]
|
||||
assets_url: str
|
||||
author: str # parsed from GitHub release metadata as: author[login]
|
||||
created_at: str
|
||||
html_url: str
|
||||
id: int
|
||||
name: str
|
||||
published_at: str
|
||||
url: str
|
||||
zipball_url: str
|
||||
tag_name: str = None
|
||||
description: str = None # body
|
||||
|
||||
|
||||
class Manifest:
|
||||
"""Manifest handler for GitHub releases."""
|
||||
|
||||
def __init__(self, repo: str = 'elastic/detection-rules', release_name=None, tag_name=None, token=None):
|
||||
self.repo_name = repo
|
||||
self.release_name = release_name
|
||||
self.tag_name = tag_name
|
||||
self.gh_client = GithubClient(token)
|
||||
self.has_token = token is not None
|
||||
|
||||
self.repo: Repository = self.gh_client.client.get_repo(repo)
|
||||
self.release: GitRelease = get_gh_release(self.repo, release_name, tag_name)
|
||||
|
||||
if not self.release:
|
||||
raise ValueError(f'No release found for {tag_name or release_name}')
|
||||
|
||||
if not self.release_name:
|
||||
self.release_name = self.release.title
|
||||
|
||||
self.manifest_name = f'manifest-{self.release_name}.json'
|
||||
self.assets: dict = self._get_enriched_assets_from_release()
|
||||
self.release_manifest = self._create()
|
||||
self.__release_manifest_dict = dataclasses.asdict(self.release_manifest)
|
||||
self.manifest_size = len(json.dumps(self.__release_manifest_dict))
|
||||
|
||||
@property
|
||||
def release_manifest_fl(self):
|
||||
return io.BytesIO(json.dumps(self.__release_manifest_dict, sort_keys=True).encode('utf-8'))
|
||||
|
||||
def _create(self):
|
||||
"""Create the manifest from GitHub asset metadata and file contents."""
|
||||
assets = {}
|
||||
for asset_name, asset_data in self.assets.items():
|
||||
entries = {}
|
||||
data = asset_data['data']
|
||||
metadata = asset_data['metadata']
|
||||
|
||||
for file_name, file_data in data.items():
|
||||
file_metadata = file_data['metadata']
|
||||
|
||||
name = Path(file_name).name
|
||||
file_metadata.update(name=name)
|
||||
|
||||
entry = AssetManifestEntry(**file_metadata)
|
||||
entries[name] = entry
|
||||
|
||||
assets[asset_name] = AssetManifestMetadata(metadata['browser_download_url'], entries,
|
||||
metadata['zipped_sha256'], metadata['created_at'],
|
||||
metadata['label'])
|
||||
|
||||
release_metadata = self._parse_release_metadata()
|
||||
release_metadata.update(assets=assets)
|
||||
release_manifest = ReleaseManifest(**release_metadata)
|
||||
|
||||
return release_manifest
|
||||
|
||||
def _parse_release_metadata(self):
|
||||
"""Parse relevant info from GitHub metadata for release manifest."""
|
||||
ignore = ['assets']
|
||||
manual_set_keys = ['author', 'description']
|
||||
keys = [f.name for f in dataclasses.fields(ReleaseManifest) if f.name not in ignore + manual_set_keys]
|
||||
parsed = {k: self.release.raw_data[k] for k in keys}
|
||||
parsed.update(description=self.release.raw_data['body'], author=self.release.raw_data['author']['login'])
|
||||
return parsed
|
||||
|
||||
def save(self) -> GitReleaseAsset:
|
||||
"""Save manifest files."""
|
||||
if not self.has_token:
|
||||
raise ValueError('You must provide a token to save a manifest to a GitHub release')
|
||||
|
||||
asset = self.release.upload_asset_from_memory(self.release_manifest_fl,
|
||||
self.manifest_size,
|
||||
self.manifest_name)
|
||||
click.echo(f'Manifest saved as {self.manifest_name} to {self.release.html_url}')
|
||||
return asset
|
||||
|
||||
@classmethod
|
||||
def load(cls, name: str, repo: str = 'elastic/detection-rules', token=None) -> dict:
|
||||
"""Load a manifest."""
|
||||
gh_client = GithubClient(token)
|
||||
repo = gh_client.client.get_repo(repo)
|
||||
release = get_gh_release(repo, tag_name=name)
|
||||
asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None)
|
||||
|
||||
if asset is not None:
|
||||
return load_json_gh_asset(asset.browser_download_url)
|
||||
|
||||
@classmethod
|
||||
def load_all(cls, repo: str = 'elastic/detection-rules', token=None) -> Tuple[Dict[str, dict], list]:
|
||||
"""Load a consolidated manifest."""
|
||||
gh_client = GithubClient(token)
|
||||
repo = gh_client.client.get_repo(repo)
|
||||
|
||||
consolidated = {}
|
||||
missing = set()
|
||||
for release in repo.get_releases():
|
||||
name = release.tag_name
|
||||
asset = next((a for a in release.get_assets() if a.name == f'manifest-{name}.json'), None)
|
||||
if not asset:
|
||||
missing.add(name)
|
||||
else:
|
||||
consolidated[name] = load_json_gh_asset(asset.browser_download_url)
|
||||
|
||||
return consolidated, list(missing)
|
||||
|
||||
@classmethod
|
||||
def get_existing_asset_hashes(cls, repo: str = 'elastic/detection-rules', token=None) -> dict:
|
||||
"""Load all assets with their hashes, by release."""
|
||||
flat = {}
|
||||
consolidated, _ = cls.load_all(repo=repo, token=token)
|
||||
for release, data in consolidated.items():
|
||||
for asset in data['assets'].values():
|
||||
flat_release = flat[release] = {}
|
||||
for asset_name, asset_data in asset['entries'].items():
|
||||
flat_release[asset_name] = asset_data['sha256']
|
||||
|
||||
return flat
|
||||
|
||||
def _get_enriched_assets_from_release(self):
|
||||
"""Get assets and metadata from a GitHub release."""
|
||||
assets = {}
|
||||
for asset in [a.raw_data for a in self.release.get_assets()]:
|
||||
zipped_sha256, data = load_zipped_gh_assets_with_metadata(asset['browser_download_url'])
|
||||
asset.update(zipped_sha256=zipped_sha256)
|
||||
|
||||
assets[asset['name']] = {
|
||||
'metadata': asset,
|
||||
'data': data
|
||||
}
|
||||
|
||||
return assets
|
||||
|
||||
|
||||
def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]:
|
||||
"""Load all ML DGA model release manifests by model id."""
|
||||
manifests, _ = Manifest.load_all(repo=repo)
|
||||
model_manifests = {}
|
||||
|
||||
for manifest_name, manifest in manifests.items():
|
||||
for asset_name, asset in manifest['assets'].items():
|
||||
for entry_name, entry_data in asset['entries'].items():
|
||||
if entry_name.startswith('dga') and entry_name.endswith('model.json'):
|
||||
model_id, _ = entry_name.rsplit('_', 1)
|
||||
model_manifests[model_id] = ReleaseManifest(**manifest)
|
||||
break
|
||||
|
||||
return model_manifests
|
||||
|
||||
|
||||
class ClientError(click.ClickException):
|
||||
"""Custom CLI error to format output or full debug stacktrace."""
|
||||
|
||||
|
||||
@@ -0,0 +1,452 @@
|
||||
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
# or more contributor license agreements. Licensed under the Elastic License
|
||||
# 2.0; you may not use this file except in compliance with the Elastic License
|
||||
# 2.0.
|
||||
|
||||
"""Schemas and dataclasses for experimental ML features."""
|
||||
|
||||
import io
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property, lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Literal, Optional
|
||||
|
||||
import click
|
||||
import elasticsearch
|
||||
import json
|
||||
import requests
|
||||
from eql.table import Table
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch.client import IngestClient, LicenseClient, MlClient
|
||||
|
||||
from .eswrap import es_experimental
|
||||
from .ghwrap import ManifestManager, ReleaseManifest
|
||||
from .misc import client_error
|
||||
from .schemas import definitions
|
||||
from .utils import get_path, unzip_to_dict
|
||||
|
||||
|
||||
ML_PATH = Path(get_path('machine-learning'))
|
||||
|
||||
|
||||
def info_from_tag(tag: str) -> (Literal['ml'], definitions.MachineLearningType, str, int):
|
||||
try:
|
||||
ml, release_type, release_date, release_number = tag.split('-')
|
||||
except ValueError as exc:
|
||||
raise ValueError(f'{tag} is not of valid release format: ml-type-date-number. {exc}')
|
||||
|
||||
return ml, release_type, release_date, int(release_number)
|
||||
|
||||
|
||||
class InvalidLicenseError(Exception):
|
||||
"""Invalid stack license for ML features requiring platinum or enterprise."""
|
||||
|
||||
|
||||
@dataclass
|
||||
class MachineLearningClient:
|
||||
"""Class for experimental machine learning release clients."""
|
||||
|
||||
es_client: Elasticsearch
|
||||
bundle: dict
|
||||
|
||||
@cached_property
|
||||
def model_id(self) -> str:
|
||||
return next(data['model_id'] for name, data in self.bundle.items() if Path(name).stem.lower().endswith('model'))
|
||||
|
||||
@cached_property
|
||||
def bundle_type(self) -> str:
|
||||
return self.model_id.split('_')[0].lower()
|
||||
|
||||
@cached_property
|
||||
def ml_client(self) -> MlClient:
|
||||
return MlClient(self.es_client)
|
||||
|
||||
@cached_property
|
||||
def ingest_client(self) -> IngestClient:
|
||||
return IngestClient(self.es_client)
|
||||
|
||||
@cached_property
|
||||
def license(self) -> str:
|
||||
license_client = LicenseClient(self.es_client)
|
||||
return license_client.get()['license']['type'].lower()
|
||||
|
||||
@staticmethod
|
||||
@lru_cache
|
||||
def ml_manifests() -> Dict[str, ReleaseManifest]:
|
||||
return get_ml_model_manifests_by_model_id()
|
||||
|
||||
def verify_license(self):
|
||||
valid_license = self.license in ('platinum', 'enterprise')
|
||||
|
||||
if not valid_license:
|
||||
err_msg = 'Your subscription level does not support Machine Learning. See ' \
|
||||
'https://www.elastic.co/subscriptions for more information.'
|
||||
raise InvalidLicenseError(err_msg)
|
||||
|
||||
@classmethod
|
||||
def from_release(cls, es_client: Elasticsearch, release_tag: str,
|
||||
repo: str = 'elastic/detection-rules') -> 'MachineLearningClient':
|
||||
"""Load from a GitHub release."""
|
||||
full_type = '-'.join(info_from_tag(release_tag)[:2])
|
||||
release_url = f'https://api.github.com/repos/{repo}/releases/tags/{release_tag}'
|
||||
release = requests.get(release_url)
|
||||
release.raise_for_status()
|
||||
|
||||
# check that the release only has a single zip file
|
||||
assets = [a for a in release.json()['assets'] if
|
||||
a['name'].startswith(full_type) and a['name'].endswith('.zip')]
|
||||
assert len(assets) == 1, f'Malformed release: expected 1 {full_type} zip file, found: {len(assets)}!'
|
||||
|
||||
zipped_url = assets[0]['browser_download_url']
|
||||
zipped_raw = requests.get(zipped_url)
|
||||
zipped_bundle = zipfile.ZipFile(io.BytesIO(zipped_raw.content))
|
||||
bundle = unzip_to_dict(zipped_bundle)
|
||||
|
||||
return cls(es_client=es_client, bundle=bundle)
|
||||
|
||||
@classmethod
|
||||
def from_directory(cls, es_client: Elasticsearch, directory: Path) -> 'MachineLearningClient':
|
||||
"""Load from an unzipped local directory."""
|
||||
bundle = json.loads(directory.read_text())
|
||||
return cls(es_client=es_client, bundle=bundle)
|
||||
|
||||
def remove(self) -> dict:
|
||||
"""Remove machine learning files from a stack."""
|
||||
results = dict(script={}, pipeline={}, model={})
|
||||
for pipeline in list(self.get_related_pipelines()):
|
||||
results['pipeline'][pipeline] = self.ingest_client.delete_pipeline(pipeline)
|
||||
for script in list(self.get_related_scripts()):
|
||||
results['script'][script] = self.es_client.delete_script(script)
|
||||
|
||||
results['model'][self.model_id] = self.ml_client.delete_trained_model(self.model_id)
|
||||
return results
|
||||
|
||||
def setup(self) -> dict:
|
||||
"""Setup machine learning bundle on a stack."""
|
||||
self.verify_license()
|
||||
results = dict(script={}, pipeline={}, model={})
|
||||
|
||||
# upload in order: model, scripts, then pipelines
|
||||
parsed_bundle = dict(model={}, script={}, pipeline={})
|
||||
for filename, data in self.bundle.items():
|
||||
fp = Path(filename)
|
||||
file_type = fp.stem.split('_')[-1]
|
||||
parsed_bundle[file_type][fp.stem] = data
|
||||
|
||||
model = list(parsed_bundle['model'].values())[0]
|
||||
results['model'][model['model_id']] = self.upload_model(model['model_id'], model)
|
||||
|
||||
for script_name, script in parsed_bundle['script'].items():
|
||||
results['script'][script_name] = self.upload_script(script_name, script)
|
||||
|
||||
for pipeline_name, pipeline in parsed_bundle['pipeline'].items():
|
||||
results['pipeline'][pipeline_name] = self.upload_ingest_pipeline(pipeline_name, pipeline)
|
||||
|
||||
return results
|
||||
|
||||
def get_all_scripts(self) -> Dict[str, dict]:
|
||||
"""Get all scripts from an elasticsearch instance."""
|
||||
return self.es_client.cluster.state()['metadata']['stored_scripts']
|
||||
|
||||
def get_related_scripts(self) -> Dict[str, dict]:
|
||||
"""Get all scripts which start with ml_*."""
|
||||
scripts = self.get_all_scripts()
|
||||
return {n: s for n, s in scripts.items() if n.lower().startswith(f'ml_{self.bundle_type}')}
|
||||
|
||||
def get_related_pipelines(self) -> Dict[str, dict]:
|
||||
"""Get all pipelines which start with ml_*."""
|
||||
pipelines = self.ingest_client.get_pipeline()
|
||||
return {n: s for n, s in pipelines.items() if n.lower().startswith(f'ml_{self.bundle_type}')}
|
||||
|
||||
def get_related_model(self) -> Optional[dict]:
|
||||
"""Get a model from an elasticsearch instance matching the model_id."""
|
||||
for model in self.get_all_existing_model_files():
|
||||
if model['model_id'] == self.model_id:
|
||||
return model
|
||||
|
||||
def get_all_existing_model_files(self) -> dict:
|
||||
"""Get available models from a stack."""
|
||||
return self.ml_client.get_trained_models()['trained_model_configs']
|
||||
|
||||
@classmethod
|
||||
def get_existing_model_ids(cls, es_client: Elasticsearch) -> List[str]:
|
||||
"""Get model IDs for existing ML models."""
|
||||
ml_client = MlClient(es_client)
|
||||
return [m['model_id'] for m in ml_client.get_trained_models()['trained_model_configs']
|
||||
if m['model_id'] in cls.ml_manifests()]
|
||||
|
||||
@classmethod
|
||||
def check_model_exists(cls, es_client: Elasticsearch, model_id: str) -> bool:
|
||||
"""Check if a model exists on a stack by model id."""
|
||||
ml_client = MlClient(es_client)
|
||||
return model_id in [m['model_id'] for m in ml_client.get_trained_models()['trained_model_configs']]
|
||||
|
||||
def get_related_files(self) -> dict:
|
||||
"""Check for the presence and status of ML bundle files on a stack."""
|
||||
files = {
|
||||
'pipeline': self.get_related_pipelines(),
|
||||
'script': self.get_related_scripts(),
|
||||
'model': self.get_related_model(),
|
||||
'release': self.get_related_release()
|
||||
}
|
||||
return files
|
||||
|
||||
def get_related_release(self) -> ReleaseManifest:
|
||||
"""Get the GitHub release related to a model."""
|
||||
return self.ml_manifests.get(self.model_id)
|
||||
|
||||
@classmethod
|
||||
def get_all_ml_files(cls, es_client: Elasticsearch) -> dict:
|
||||
"""Get all scripts, pipelines, and models which start with ml_*."""
|
||||
pipelines = IngestClient(es_client).get_pipeline()
|
||||
scripts = es_client.cluster.state()['metadata']['stored_scripts']
|
||||
models = MlClient(es_client).get_trained_models()['trained_model_configs']
|
||||
manifests = get_ml_model_manifests_by_model_id()
|
||||
|
||||
files = {
|
||||
'pipeline': {n: s for n, s in pipelines.items() if n.lower().startswith('ml_')},
|
||||
'script': {n: s for n, s in scripts.items() if n.lower().startswith('ml_')},
|
||||
'model': {m['model_id']: {'model': m, 'release': manifests[m['model_id']]}
|
||||
for m in models if m['model_id'] in manifests},
|
||||
}
|
||||
return files
|
||||
|
||||
@classmethod
|
||||
def remove_ml_scripts_pipelines(cls, es_client: Elasticsearch, ml_type: List[str]) -> dict:
|
||||
"""Remove all ML script and pipeline files."""
|
||||
results = dict(script={}, pipeline={})
|
||||
ingest_client = IngestClient(es_client)
|
||||
|
||||
files = cls.get_all_ml_files(es_client=es_client)
|
||||
for file_type, data in files.items():
|
||||
for name in list(data):
|
||||
this_type = name.split('_')[1].lower()
|
||||
if this_type not in ml_type:
|
||||
continue
|
||||
if file_type == 'script':
|
||||
results[file_type][name] = es_client.delete_script(name)
|
||||
elif file_type == 'pipeline':
|
||||
results[file_type][name] = ingest_client.delete_pipeline(name)
|
||||
|
||||
return results
|
||||
|
||||
def upload_model(self, model_id: str, body: dict) -> dict:
|
||||
"""Upload an ML model file."""
|
||||
return self.ml_client.put_trained_model(model_id=model_id, body=body)
|
||||
|
||||
def upload_script(self, script_id: str, body: dict) -> dict:
|
||||
"""Install a script file."""
|
||||
return self.es_client.put_script(id=script_id, body=body)
|
||||
|
||||
def upload_ingest_pipeline(self, pipeline_id: str, body: dict) -> dict:
|
||||
"""Install a pipeline file."""
|
||||
return self.ingest_client.put_pipeline(id=pipeline_id, body=body)
|
||||
|
||||
@staticmethod
|
||||
def _build_script_error(exc: elasticsearch.RequestError, pipeline_file: str):
|
||||
"""Build an error for a failed script upload."""
|
||||
error = exc.info['error']
|
||||
cause = error['caused_by']
|
||||
error_msg = [
|
||||
f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
|
||||
' '.join(f'{k}: {v}' for k, v in error['position'].items()),
|
||||
'\n'.join(error['script_stack'])
|
||||
]
|
||||
|
||||
return click.style('\n'.join(error_msg), fg='red')
|
||||
|
||||
|
||||
def get_ml_model_manifests_by_model_id(repo: str = 'elastic/detection-rules') -> Dict[str, ReleaseManifest]:
|
||||
"""Load all ML DGA model release manifests by model id."""
|
||||
manifests, _ = ManifestManager.load_all(repo=repo)
|
||||
model_manifests = {}
|
||||
|
||||
for manifest_name, manifest in manifests.items():
|
||||
for asset_name, asset in manifest['assets'].items():
|
||||
for entry_name, entry_data in asset['entries'].items():
|
||||
if entry_name.startswith('dga') and entry_name.endswith('model.json'):
|
||||
model_id, _ = entry_name.rsplit('_', 1)
|
||||
model_manifests[model_id] = ReleaseManifest(**manifest)
|
||||
break
|
||||
|
||||
return model_manifests
|
||||
|
||||
|
||||
@es_experimental.group('ml')
|
||||
def ml_group():
|
||||
"""Experimental machine learning commands."""
|
||||
|
||||
|
||||
@ml_group.command('check-files')
|
||||
@click.pass_context
|
||||
def check_files(ctx):
|
||||
"""Check ML model files on an elasticsearch instance."""
|
||||
files = MachineLearningClient.get_all_ml_files(ctx.obj['es'])
|
||||
|
||||
results = []
|
||||
for file_type, data in files.items():
|
||||
if file_type == 'model':
|
||||
continue
|
||||
for name in list(data):
|
||||
results.append({'file_type': file_type, 'name': name})
|
||||
|
||||
for model_name, model in files['model'].items():
|
||||
results.append({'file_type': 'model', 'name': model_name, 'related_release': model['release'].tag_name})
|
||||
|
||||
fields = ['file_type', 'name', 'related_release']
|
||||
table = Table.from_list(fields, results)
|
||||
click.echo(table)
|
||||
return files
|
||||
|
||||
|
||||
@ml_group.command('remove-model')
|
||||
@click.argument('model-id', required=False)
|
||||
@click.pass_context
|
||||
def remove_model(ctx: click.Context, model_id):
|
||||
"""Remove ML model files."""
|
||||
es_client = MlClient(ctx.obj['es'])
|
||||
model_ids = MachineLearningClient.get_existing_model_ids(ctx.obj['es'])
|
||||
|
||||
if not model_id:
|
||||
model_id = click.prompt('Model ID to remove', type=click.Choice(model_ids))
|
||||
|
||||
try:
|
||||
result = es_client.delete_trained_model(model_id)
|
||||
except elasticsearch.ConflictError as e:
|
||||
click.echo(f'{e}: try running `remove-scripts-pipelines` first')
|
||||
ctx.exit(1)
|
||||
|
||||
table = Table.from_list(['model_id', 'status'], [{'model_id': model_id, 'status': result}])
|
||||
click.echo(table)
|
||||
return result
|
||||
|
||||
|
||||
@ml_group.command('remove-scripts-pipelines')
|
||||
@click.option('--dga', is_flag=True)
|
||||
@click.option('--problemchild', is_flag=True)
|
||||
@click.pass_context
|
||||
def remove_scripts_pipelines(ctx: click.Context, **ml_types):
|
||||
"""Remove ML scripts and pipeline files."""
|
||||
selected_types = [k for k, v in ml_types.items() if v]
|
||||
assert selected_types, f'Specify ML types to remove: {list(ml_types)}'
|
||||
status = MachineLearningClient.remove_ml_scripts_pipelines(es_client=ctx.obj['es'], ml_type=selected_types)
|
||||
|
||||
results = []
|
||||
for file_type, response in status.items():
|
||||
for name, result in response.items():
|
||||
results.append({'file_type': file_type, 'name': name, 'status': result})
|
||||
|
||||
fields = ['file_type', 'name', 'status']
|
||||
table = Table.from_list(fields, results)
|
||||
click.echo(table)
|
||||
return status
|
||||
|
||||
|
||||
@ml_group.command('setup')
|
||||
@click.option('--model-tag', '-t',
|
||||
help='Release tag for model files staged in detection-rules (required to download files)')
|
||||
@click.option('--repo', '-r', default='elastic/detection-rules',
|
||||
help='GitHub repository hosting the model file releases (owner/repo)')
|
||||
@click.option('--model-dir', '-d', type=click.Path(exists=True, file_okay=False),
|
||||
help='Directory containing local model files')
|
||||
@click.pass_context
|
||||
def setup_bundle(ctx, model_tag, repo, model_dir):
|
||||
"""Upload ML model and dependencies to enrich data."""
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
|
||||
if model_tag:
|
||||
dga_client = MachineLearningClient.from_release(es_client=es_client, release_tag=model_tag, repo=repo)
|
||||
elif model_dir:
|
||||
dga_client = MachineLearningClient.from_directory(es_client=es_client, directory=model_dir)
|
||||
else:
|
||||
return client_error('model-tag or model-dir required to download model files')
|
||||
|
||||
dga_client.verify_license()
|
||||
status = dga_client.setup()
|
||||
|
||||
results = []
|
||||
for file_type, response in status.items():
|
||||
for name, result in response.items():
|
||||
if file_type == 'model':
|
||||
status = 'success' if result.get('create_time') else 'potential_failure'
|
||||
results.append({'file_type': file_type, 'name': name, 'status': status})
|
||||
continue
|
||||
results.append({'file_type': file_type, 'name': name, 'status': result})
|
||||
|
||||
fields = ['file_type', 'name', 'status']
|
||||
table = Table.from_list(fields, results)
|
||||
click.echo(table)
|
||||
|
||||
click.echo('Associated rules and jobs can be found under ML-experimental-detections releases in the repo')
|
||||
click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
|
||||
click.echo('To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
|
||||
|
||||
|
||||
@ml_group.command('upload-job')
|
||||
@click.argument('job-file', type=click.Path(exists=True, dir_okay=False))
|
||||
@click.option('--overwrite', '-o', is_flag=True, help='Overwrite job if exists by name')
|
||||
@click.pass_context
|
||||
def upload_job(ctx: click.Context, job_file, overwrite):
|
||||
"""Upload experimental ML jobs."""
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
ml_client = MlClient(es_client)
|
||||
|
||||
with open(job_file, 'r') as f:
|
||||
job = json.load(f)
|
||||
|
||||
def safe_upload(func):
|
||||
try:
|
||||
func(name, body)
|
||||
except (elasticsearch.ConflictError, elasticsearch.RequestError) as err:
|
||||
if isinstance(err, elasticsearch.RequestError) and err.error != 'resource_already_exists_exception':
|
||||
client_error(str(err), err, ctx=ctx)
|
||||
|
||||
if overwrite:
|
||||
ctx.invoke(delete_job, job_name=name, job_type=job_type)
|
||||
func(name, body)
|
||||
else:
|
||||
client_error(str(err), err, ctx=ctx)
|
||||
|
||||
try:
|
||||
job_type = job['type']
|
||||
name = job['name']
|
||||
body = job['body']
|
||||
|
||||
if job_type == 'anomaly_detection':
|
||||
safe_upload(ml_client.put_job)
|
||||
elif job_type == 'data_frame_analytic':
|
||||
safe_upload(ml_client.put_data_frame_analytics)
|
||||
elif job_type == 'datafeed':
|
||||
safe_upload(ml_client.put_datafeed)
|
||||
else:
|
||||
client_error(f'Unknown ML job type: {job_type}')
|
||||
|
||||
click.echo(f'Uploaded {job_type} job: {name}')
|
||||
except KeyError as e:
|
||||
client_error(f'{job_file} missing required info: {e}')
|
||||
|
||||
|
||||
@ml_group.command('delete-job')
|
||||
@click.argument('job-name')
|
||||
@click.argument('job-type')
|
||||
@click.pass_context
|
||||
def delete_job(ctx: click.Context, job_name, job_type, verbose=True):
|
||||
"""Remove experimental ML jobs."""
|
||||
es_client: Elasticsearch = ctx.obj['es']
|
||||
ml_client = MlClient(es_client)
|
||||
|
||||
try:
|
||||
if job_type == 'anomaly_detection':
|
||||
ml_client.delete_job(job_name)
|
||||
elif job_type == 'data_frame_analytic':
|
||||
ml_client.delete_data_frame_analytics(job_name)
|
||||
elif job_type == 'datafeed':
|
||||
ml_client.delete_datafeed(job_name)
|
||||
else:
|
||||
client_error(f'Unknown ML job type: {job_type}')
|
||||
except (elasticsearch.NotFoundError, elasticsearch.ConflictError) as e:
|
||||
client_error(str(e), e, ctx=ctx)
|
||||
|
||||
if verbose:
|
||||
click.echo(f'Deleted {job_type} job: {job_name}')
|
||||
@@ -69,3 +69,8 @@ ThresholdValue = NewType("ThresholdValue", int, validate=validate.Range(min=1))
|
||||
TimelineTemplateId = NewType('TimelineTemplateId', str, validate=validate.OneOf(list(TIMELINE_TEMPLATES)))
|
||||
TimelineTemplateTitle = NewType('TimelineTemplateTitle', str, validate=validate.OneOf(TIMELINE_TEMPLATES.values()))
|
||||
UUIDString = NewType('UUIDString', str, validate=validate.Regexp(UUID_PATTERN))
|
||||
|
||||
|
||||
# experimental machine learning features and releases
|
||||
MachineLearningType = Literal['DGA', 'ProblemChild']
|
||||
MachineLearningTypeLower = Literal['dga', 'problemchild']
|
||||
|
||||
@@ -18,6 +18,7 @@ import zipfile
|
||||
from dataclasses import is_dataclass, astuple
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from typing import Dict, Union
|
||||
|
||||
import eql.utils
|
||||
from eql.utils import load_dump, stream_json_lines
|
||||
@@ -148,6 +149,24 @@ def unzip_and_save(contents, path, member=None, verbose=True):
|
||||
print('Saved files to {}: \n\t- {}'.format(path, '\n\t- '.join(name_list)))
|
||||
|
||||
|
||||
def unzip_to_dict(zipped: zipfile.ZipFile, load_json=True) -> Dict[str, Union[dict, str]]:
|
||||
"""Unzip and load contents to dict with filenames as keys."""
|
||||
bundle = {}
|
||||
for filename in zipped.namelist():
|
||||
if filename.endswith('/'):
|
||||
continue
|
||||
|
||||
fp = Path(filename)
|
||||
contents = zipped.read(filename)
|
||||
|
||||
if load_json and fp.suffix == '.json':
|
||||
contents = json.loads(contents)
|
||||
|
||||
bundle[fp.name] = contents
|
||||
|
||||
return bundle
|
||||
|
||||
|
||||
def event_sort(events, timestamp='@timestamp', date_format='%Y-%m-%dT%H:%M:%S.%f%z', asc=True):
|
||||
"""Sort events from elasticsearch by timestamp."""
|
||||
|
||||
|
||||
-191
@@ -1,191 +0,0 @@
|
||||
# Machine Learning on Domain Generation Algorithm (DGA)
|
||||
|
||||
Several blogs were put out on how you can create and leverage supervised DGA ML models to enrich data within the stack.
|
||||
* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity)
|
||||
* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data)
|
||||
|
||||
You can also find some supplementary and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection)
|
||||
|
||||
We also released a blog on getting started with DGA using the CLI and Kibana, which also includes a case study of the process applied to the 2020 [SolarWinds supply chain attack](https://www.elastic.co/blog/elastic-security-provides-free-and-open-protections-for-sunburst):
|
||||
* [Combining supervised and unsupervised machine learning for DGA detection](https://www.elastic.co/blog/supervised-and-unsupervised-machine-learning-for-dga-detection)
|
||||
|
||||
|
||||
For questions, please reach out to the ML team in the #machine-learning channel of the
|
||||
[Elastic public slack channel](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack)
|
||||
|
||||
They can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning)
|
||||
|
||||
*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)*
|
||||
*Note: the ML features are considered experimental in Kibana as well as this rules CLI*
|
||||
|
||||
## Releases
|
||||
|
||||
Models and dependencies will be [released](https://github.com/elastic/detection-rules/releases) as `ML-DGA-YYYMMDD-N`.
|
||||
This tag name is what will need to be passed to the CLI command.
|
||||
|
||||
## Uploading a model and dependencies using the CLI
|
||||
|
||||
### Usage
|
||||
|
||||
```console
|
||||
python -m detection_rules es experimental setup-dga-model -h
|
||||
|
||||
Elasticsearch client:
|
||||
Options:
|
||||
-et, --timeout INTEGER Timeout for elasticsearch client
|
||||
-ep, --es-password TEXT
|
||||
-eu, --es-user TEXT
|
||||
--cloud-id TEXT
|
||||
--elasticsearch-url TEXT
|
||||
|
||||
|
||||
* experimental commands are use at your own risk and may change without warning *
|
||||
|
||||
Usage: detection_rules es experimental setup-dga-model [OPTIONS]
|
||||
|
||||
Upload ML DGA model and dependencies and enrich DNS data.
|
||||
|
||||
Options:
|
||||
-t, --model-tag TEXT Release tag for model files staged in detection-
|
||||
rules (required to download files)
|
||||
-r, --repo TEXT GitHub repository hosting the model file releases
|
||||
(owner/repo)
|
||||
-d, --model-dir DIRECTORY Directory containing local model files
|
||||
--overwrite Overwrite all files if already in the stack
|
||||
-h, --help Show this message and exit.
|
||||
|
||||
```
|
||||
|
||||
### Detailed steps
|
||||
|
||||
#### 1. Upload and setup the model file and dependencies
|
||||
|
||||
Run `python -m detection_rules es <args_or_config> experimental setup-dga-model -t <release-tag>`
|
||||
|
||||
*If updating a new model, you should first uninstall any existing models using `remove-dga-model`*
|
||||
|
||||
You can also upload files locally using the `-d` option, so long as the naming convention of the files match the
|
||||
expected pattern for the filenames.
|
||||
|
||||
#### 2. Update packetbeat configuration
|
||||
|
||||
You will need to update your packebeat.yml config file to point to the enrichment pipeline
|
||||
|
||||
Under `Elasticsearch Output` add the following:
|
||||
|
||||
```yaml
|
||||
output.elasticsearch:
|
||||
hosts: ["your-hostname:your-port"]
|
||||
pipeline: dns_enrich_pipeline
|
||||
```
|
||||
|
||||
#### 3. Refresh your packetbeat index
|
||||
|
||||
You can optionally choose to refresh your packetbeat index mapping within Kibana:
|
||||
* navigate to `Stack Management > (Kibana) Index Patterns`
|
||||
* select the applicable packetbeat index
|
||||
* click `refresh field list`
|
||||
|
||||
#### 4. Verify enrichment fields
|
||||
|
||||
Any packetbeat documents with the field `dns.question.registered_domain` should now have the enriched data:
|
||||
`ml_is_dga.*`
|
||||
|
||||
|
||||
## Experimental DGA ML Jobs and Rules
|
||||
|
||||
Once packetbeat data is being enriched, there are some rules and ML jobs which can leverage the enriched fields.
|
||||
The experimental rules and jobs will be staged separate from the model bundle under the [releases](https://github.com/elastic/detection-rules/releases)
|
||||
as `ML-experimental-detections-YYYMMDD-N`. These releases should be considered independent of each other. Any relation
|
||||
to previously released experimental detections will be mentioned in the accompanying readme (such as an update to a rule).
|
||||
|
||||
Note that if a rule is of `type = "machine_learning"`, then it may be dependent on a uploading and running a machine
|
||||
learning job first. If this is the case, it will likely be annotated within the `note` field of the rule.
|
||||
|
||||
#### Uploading rules
|
||||
|
||||
You can then individually upload these rules using the [kibana upload-rule](../CLI.md#uploading-rules-to-kibana) command
|
||||
|
||||
#### Uploading ML Jobs
|
||||
|
||||
Unzip released jobs and then run `python -m detection_rules es <args> experimental upload-ml-job <ml_job.json>`
|
||||
|
||||
To delete a job, run `python -m detection_rules es <args> experimental delete-ml-job <job-name> <job-type>`
|
||||
|
||||
Take note of any errors as some jobs may have dependencies on each other which may require stopping and or removing
|
||||
referenced jobs first.
|
||||
|
||||
|
||||
## For Maintainers
|
||||
|
||||
### Validating release bundles and releasing
|
||||
|
||||
Release assets are expected to be in certain formats with specific naming patterns and json structures.
|
||||
|
||||
#### Filename patterns
|
||||
|
||||
DGA model file naming convention should match the following patterns
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "dga_*_model.json",
|
||||
"dga_ngrams_create": "dga_*_ngrams_create.json",
|
||||
"dga_ngrams_transform_delete": "dga_*_ngrams_transform_delete.json",
|
||||
"dns_enrich_pipeline": "dga_*_ingest_pipeline1.json",
|
||||
"dns_dga_inference_enrich_pipeline": "dga_*_ingest_pipeline2.json"
|
||||
}
|
||||
```
|
||||
|
||||
Experimental detections do not have to match a specific naming pattern but should be in the following file formats:
|
||||
* rules: toml
|
||||
* jobs: json
|
||||
|
||||
#### Uniqueness
|
||||
|
||||
The model file name and hash should be unique or else it will raise a warning in validation. This is important to allow
|
||||
distinction and ascertain information about a bundle by consulting the manifest, based on a unique name
|
||||
|
||||
Release zipped assets, name, and tag name all share the same name. These should follow the following format:
|
||||
* Model releases: `ML-DGA-YYYYMMDD-1`
|
||||
* Detection releases: `ML-experimental-detections-YYYYMMDD-1`
|
||||
|
||||
The trailing digit should be incremented for each release
|
||||
|
||||
Rule and Job names should also be unique
|
||||
|
||||
#### Rule and job structure
|
||||
|
||||
Rules files are only check if they are valid toml, nothing more. Consult existing production rules and schemas for API
|
||||
expectations
|
||||
|
||||
Job files are checked if they are valid toml and contain the following top level fields:
|
||||
* name - job name
|
||||
* type - job type
|
||||
* body - the actual ML job data. The contents are not validated
|
||||
|
||||
#### Validation
|
||||
|
||||
All of these checks are automated and can be called with:
|
||||
`python -m detection_rules dev gh-release validate-ml-dga-asset` - for model bundles
|
||||
`python -m detection_rules dev gh-release validate-ml-detections-asset` for rule/job bundles
|
||||
|
||||
Pay attention to the output to determine any necessary changes. This may not be all inclusive and actual testing on a
|
||||
live stack should always occur even with passing validation before saving to a GitHub release
|
||||
|
||||
#### Including a readme for detections release
|
||||
|
||||
`ML-experimental-detections-*` releases will need to include a readme to provide an overview of the included files
|
||||
|
||||
#### Releasing
|
||||
|
||||
Install dependencies with `pip install -r requirements-dev.txt`
|
||||
|
||||
A release can be created via the cli using `python -m detection_rules dev gh-release create-ml`
|
||||
|
||||
* you can only use a github token
|
||||
* the base directory name and release name must match
|
||||
* you must have write permissions to the repo to create a release
|
||||
* validation also occurs on this, with a prompt to proceed
|
||||
* upon completion, a manifest is uploaded as an asset to the GitHub release
|
||||
|
||||
To test, you can fork the repo and use `--repo <your-fork>` to validate a release is working as expected
|
||||
@@ -0,0 +1,54 @@
|
||||
# Machine Learning on Domain Generation Algorithm (DGA)
|
||||
|
||||
To create and use supervised DGA ML models to enrich data within the stack, check out these Elastic blogs:
|
||||
* Part 1: [Machine learning in cybersecurity: Training supervised models to detect DGA activity](https://www.elastic.co/blog/machine-learning-in-cybersecurity-training-supervised-models-to-detect-dga-activity)
|
||||
* Part 2: [Machine learning in cybersecurity: Detecting DGA activity in network data](https://www.elastic.co/blog/machine-learning-in-cybersecurity-detecting-dga-activity-in-network-data)
|
||||
|
||||
You can also find some supplementary material and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/DGA%20Detection)
|
||||
|
||||
We also released a blog about getting started with DGA using the CLI and Kibana, which also includes a case study of the process applied to the 2020 [SolarWinds supply chain attack](https://www.elastic.co/blog/elastic-security-provides-free-and-open-protections-for-sunburst):
|
||||
* [Combining supervised and unsupervised machine learning for DGA detection](https://www.elastic.co/blog/supervised-and-unsupervised-machine-learning-for-dga-detection)
|
||||
|
||||
|
||||
For questions, please reach out to the ML team in the #machine-learning channel of the
|
||||
[Elastic community Slack workspace](https://www.elastic.co/blog/join-our-elastic-stack-workspace-on-slack)
|
||||
|
||||
The team can also be reached by using the `stack-machine-learning` tag in the [discuss forums](https://discuss.elastic.co/tags/c/elastic-stack/stack-machine-learning)
|
||||
|
||||
*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)*
|
||||
*Note: the ML features are considered experimental in Kibana as well as this rules CLI*
|
||||
|
||||
|
||||
## Detailed steps
|
||||
|
||||
#### 1. Upload and setup the model file and dependencies
|
||||
|
||||
Run `python -m detection_rules es <args_or_config> experimental ml setup -t <release-tag>`
|
||||
|
||||
*If updating a new model, you should first uninstall any existing models using `remove-model`*
|
||||
|
||||
You can also upload files locally using the `-d` option, so long as the naming convention of the files match the
|
||||
expected pattern for the filenames.
|
||||
|
||||
#### 2. Update packetbeat configuration
|
||||
|
||||
You will need to update your packetbeat.yml config file to point to the enrichment pipeline
|
||||
|
||||
Under `Elasticsearch Output` add the following:
|
||||
|
||||
```yaml
|
||||
output.elasticsearch:
|
||||
hosts: ["your-hostname:your-port"]
|
||||
pipeline: dns_enrich_pipeline
|
||||
```
|
||||
|
||||
#### 3. Refresh your packetbeat index
|
||||
|
||||
You can optionally choose to refresh your packetbeat index mapping from within Kibana:
|
||||
* Navigate to `Stack Management > (Kibana) Index Patterns`
|
||||
* Select the appropriate packetbeat index
|
||||
* Click `refresh field list`
|
||||
|
||||
#### 4. Verify enrichment fields
|
||||
|
||||
Any packetbeat documents with the field `dns.question.registered_domain` should now be enriched with `ml_is_dga.*`
|
||||
@@ -0,0 +1,28 @@
|
||||
# Experimental ML Jobs and Rules
|
||||
|
||||
The ingest pipeline enriches process events by adding additional fields, which are used to power several rules.
|
||||
The experimental rules and jobs are staged separately from the model bundle under [releases](https://github.com/elastic/detection-rules/releases), with the tag `ML-experimental-detections-YYYMMDD-N`. New releases with this tag may contain either updates to existing rules or new experimental detcetions.
|
||||
|
||||
Note that if a rule is of `type = "machine_learning"`, then it may be dependent on uploading and running a machine
|
||||
learning job first. If this is the case, it will likely be annotated within the `note` field of the rule.
|
||||
|
||||
### Uploading rules
|
||||
|
||||
Unzip the release bundle and upload these rules individually.
|
||||
|
||||
Rules are now stored in ndjson format and can be imported into Kibana via the security app detections page.
|
||||
|
||||
Earlier releases stored the rules in toml format. These can be uploaded using the
|
||||
[7.12 branch](https://github.com/elastic/detection-rules/tree/7.12) CLI using the
|
||||
[kibana upload-rule](../../CLI.md#uploading-rules-to-kibana) command
|
||||
|
||||
### Uploading ML Jobs and Datafeeds
|
||||
|
||||
Unzip the release bundle and then run `python -m detection_rules es <args> experimental ml upload-job <ml_job.json>`
|
||||
|
||||
To delete a job/datafeed, run `python -m detection_rules es <args> experimental ml delete-job <job-name> <job-type>`
|
||||
|
||||
The CLI automatically identifies whether the provided input file is an ML job or datafeed.
|
||||
|
||||
Take note of any errors as the jobs and datafeeds may have dependencies on each other which may require stopping and/or removing
|
||||
referenced jobs/datafeeds first.
|
||||
@@ -0,0 +1,64 @@
|
||||
# ProblemChild in the Elastic Stack
|
||||
|
||||
ProblemChild helps detect anomalous activity in Windows process events by:
|
||||
1) Classifying events as malicious vs benign
|
||||
2) Identifying anomalous events based on rare parent-child process relationships
|
||||
|
||||
An end-to-end blog on how to build the ProblemChild framework from scratch for your environment can be found [here](https://www.elastic.co/blog/problemchild-detecting-living-off-the-land-attacks).
|
||||
|
||||
You can also find some supplementary material for the blog and examples [here](https://github.com/elastic/examples/tree/master/Machine%20Learning/ProblemChild)
|
||||
|
||||
We also released a blog about getting started with ProblemChild using the CLI and Kibana:
|
||||
* [ProblemChild Release Blog](link to blog)
|
||||
|
||||
|
||||
*Note: in order to use these ML features, you must have a platinum or higher [subscription](https://www.elastic.co/subscriptions)*
|
||||
*Note: the ML features are considered experimental in Kibana as well as this rules CLI*
|
||||
|
||||
|
||||
## Detailed steps
|
||||
|
||||
#### 1. Upload and setup the model file and dependencies
|
||||
|
||||
Run `python -m detection_rules es <args_or_config> experimental ml setup -t <release-tag>`
|
||||
|
||||
*If updating a new model, you should first uninstall any existing models using `remove-model`*
|
||||
|
||||
You can also upload files locally using the `-d` option, so long as the naming convention of the files match the
|
||||
expected pattern for the filenames.
|
||||
|
||||
#### 2. Update index pipeline configuration
|
||||
|
||||
You will need to update your index (containing Windows process event data) settings to point to the ProblemChild enrichment pipeline.
|
||||
|
||||
You can do this by running the following command in your Dev Tools console:
|
||||
```
|
||||
PUT your-index-pattern/_settings
|
||||
{
|
||||
"index": {
|
||||
"default_pipeline": "ML_ProblemChild_ingest_pipeline"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
If you wish to stop enriching your documents using ProblemChild, run the following command in your dev Tools console:
|
||||
```
|
||||
PUT your-index-pattern/_settings
|
||||
{
|
||||
"index": {
|
||||
"default_pipeline": null
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
#### 3. Refresh your indexes
|
||||
|
||||
You can optionally choose to refresh your index mapping from within Kibana:
|
||||
* Navigate to `Stack Management > (Kibana) Index Patterns`
|
||||
* Select the appropriate indexes
|
||||
* Click `refresh field list`
|
||||
|
||||
#### 4. Verify enrichment fields
|
||||
|
||||
Any documents corresponding to Windows process events should now be enriched with `problemchild.*`
|
||||
@@ -0,0 +1,102 @@
|
||||
# Experimental machine learning
|
||||
|
||||
This repo contains some additional information and files to use experimental[*](#what-does-experimental-mean-in-this-context) machine learning features and detections
|
||||
|
||||
## Features
|
||||
* [DGA](DGA.md)
|
||||
* [ProblemChild](problem_child.md)
|
||||
* [experimental detections](experimental-detections.md)
|
||||
|
||||
## Releases
|
||||
|
||||
There are separate [releases](https://github.com/elastic/detection-rules/releases) for:
|
||||
* DGA: `ML-DGA-*`
|
||||
* problem child: `ML-ProblemChild-*`
|
||||
* experimental detections: `ML-experimental-detections-*`
|
||||
|
||||
Releases will use the tag `ML-TYPE-YYYMMDD-N`, which will be needed for uploading the model using the CLI.
|
||||
|
||||
|
||||
## CLI
|
||||
|
||||
Support commands can be found under `python -m detection_rules es <es args> experimental ml -h`
|
||||
|
||||
```console
|
||||
Elasticsearch client:
|
||||
Options:
|
||||
-et, --timeout INTEGER Timeout for elasticsearch client
|
||||
-ep, --es-password TEXT
|
||||
-eu, --es-user TEXT
|
||||
--elasticsearch-url TEXT
|
||||
--cloud-id TEXT
|
||||
|
||||
|
||||
* experimental commands are use at your own risk and may change without warning *
|
||||
|
||||
Usage: detection_rules es experimental ml [OPTIONS] COMMAND [ARGS]...
|
||||
|
||||
Experimental machine learning commands.
|
||||
|
||||
Options:
|
||||
-h, --help Show this message and exit.
|
||||
|
||||
Commands:
|
||||
check-files Check ML model files on an elasticsearch...
|
||||
delete-job Remove experimental ML jobs.
|
||||
remove-model Remove ML model files.
|
||||
remove-scripts-pipelines Remove ML scripts and pipeline files.
|
||||
setup Upload ML model and dependencies to enrich data.
|
||||
upload-job Upload experimental ML jobs.
|
||||
```
|
||||
|
||||
## Managing a model and dependencies using the CLI
|
||||
|
||||
### Installing
|
||||
|
||||
```console
|
||||
python -m detection_rules es experimental ml setup -h
|
||||
|
||||
Elasticsearch client:
|
||||
Options:
|
||||
-et, --timeout INTEGER Timeout for elasticsearch client
|
||||
-ep, --es-password TEXT
|
||||
-eu, --es-user TEXT
|
||||
--cloud-id TEXT
|
||||
--elasticsearch-url TEXT
|
||||
|
||||
|
||||
* experimental commands are use at your own risk and may change without warning *
|
||||
|
||||
Usage: detection_rules es experimental ml setup [OPTIONS]
|
||||
|
||||
Upload ML model and dependencies to enrich data.
|
||||
|
||||
Options:
|
||||
-t, --model-tag TEXT Release tag for model files staged in detection-
|
||||
rules (required to download files)
|
||||
-r, --repo TEXT GitHub repository hosting the model file releases
|
||||
(owner/repo)
|
||||
-d, --model-dir DIRECTORY Directory containing local model files
|
||||
--overwrite Overwrite all files if already in the stack
|
||||
-h, --help Show this message and exit.
|
||||
|
||||
```
|
||||
|
||||
### Removing
|
||||
|
||||
To remove the ML bundle, you will need to remove the pipelines and scripts first and then the model.
|
||||
|
||||
You can do this by running:
|
||||
* `python -m detection_rules es experimental ml remove-pipeline-scripts --dga --problemchild`
|
||||
* `python -m detection_rules es experimental ml remove-model <model-id>`
|
||||
|
||||
|
||||
----
|
||||
|
||||
##### What does experimental mean in this context?
|
||||
|
||||
Experimental model bundles (models, scripts, and pipelines), rules, and jobs are components which are currently in
|
||||
development and so may not have completed the testing or scrutiny which full production detections are subjected to.
|
||||
|
||||
It may also make use of features which are not yet GA and so may be subject to change and are not covered by the support
|
||||
SLA of general release (GA) features. Some of these features may also never make it to GA.
|
||||
Reference in New Issue
Block a user