Files
sigma-rules/detection_rules/eswrap.py
T

402 lines
18 KiB
Python
Raw Normal View History

2020-06-29 23:17:38 -06:00
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2021-03-03 22:12:11 -09:00
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
2020-06-29 23:17:38 -06:00
"""Elasticsearch cli commands."""
2020-06-29 23:17:38 -06:00
import json
import os
import time
from collections import defaultdict
from typing import Union
2020-06-29 23:17:38 -06:00
import click
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch.client.async_search import AsyncSearchClient
2020-06-29 23:17:38 -06:00
import kql
2020-06-29 23:17:38 -06:00
from .main import root
from .misc import add_params, client_error, elasticsearch_options, get_elasticsearch_client
2021-03-24 10:24:32 -06:00
from .rule import TOMLRule
from .rule_loader import rta_mappings, RuleCollection
from .utils import format_command_options, normalize_timing_and_sort, unix_time_to_formatted, get_path
2020-06-29 23:17:38 -06:00
COLLECTION_DIR = get_path('collections')
MATCH_ALL = {'bool': {'filter': [{'match_all': {}}]}}
2020-06-29 23:17:38 -06:00
def add_range_to_dsl(dsl_filter, start_time, end_time='now'):
dsl_filter.append(
{"range": {"@timestamp": {"gt": start_time, "lte": end_time, "format": "strict_date_optional_time"}}}
)
2020-06-29 23:17:38 -06:00
class RtaEvents(object):
2020-06-29 23:17:38 -06:00
"""Events collected from Elasticsearch."""
def __init__(self, events):
self.events: dict = self._normalize_event_timing(events)
2020-06-29 23:17:38 -06:00
@staticmethod
def _normalize_event_timing(events):
"""Normalize event timestamps and sort."""
for agent_type, _events in events.items():
events[agent_type] = normalize_timing_and_sort(_events)
return events
@staticmethod
def _get_dump_dir(rta_name=None, host_id=None):
2020-06-29 23:17:38 -06:00
"""Prepare and get the dump path."""
if rta_name:
dump_dir = get_path('unit_tests', 'data', 'true_positives', rta_name)
os.makedirs(dump_dir, exist_ok=True)
return dump_dir
else:
time_str = time.strftime('%Y%m%dT%H%M%SL')
dump_dir = os.path.join(COLLECTION_DIR, host_id or 'unknown_host', time_str)
2020-06-29 23:17:38 -06:00
os.makedirs(dump_dir, exist_ok=True)
return dump_dir
def evaluate_against_rule_and_update_mapping(self, rule_id, rta_name, verbose=True):
"""Evaluate a rule against collected events and update mapping."""
from .utils import combine_sources, evaluate
rule = next((rule for rule in RuleCollection.default() if rule.id == rule_id), None)
assert rule is not None, f"Unable to find rule with ID {rule_id}"
2020-06-29 23:17:38 -06:00
merged_events = combine_sources(*self.events.values())
filtered = evaluate(rule, merged_events)
if filtered:
sources = [e['agent']['type'] for e in filtered]
mapping_update = rta_mappings.add_rule_to_mapping_file(rule, len(filtered), rta_name, *sources)
if verbose:
click.echo('Updated rule-mapping file with: \n{}'.format(json.dumps(mapping_update, indent=2)))
else:
if verbose:
click.echo('No updates to rule-mapping file; No matching results')
def echo_events(self, pager=False, pretty=True):
"""Print events to stdout."""
echo_fn = click.echo_via_pager if pager else click.echo
echo_fn(json.dumps(self.events, indent=2 if pretty else None, sort_keys=True))
def save(self, rta_name=None, dump_dir=None, host_id=None):
2020-06-29 23:17:38 -06:00
"""Save collected events."""
assert self.events, 'Nothing to save. Run Collector.run() method first or verify logging'
2020-06-29 23:17:38 -06:00
dump_dir = dump_dir or self._get_dump_dir(rta_name=rta_name, host_id=host_id)
2020-06-29 23:17:38 -06:00
for source, events in self.events.items():
path = os.path.join(dump_dir, source + '.jsonl')
with open(path, 'w') as f:
f.writelines([json.dumps(e, sort_keys=True) + '\n' for e in events])
click.echo('{} events saved to: {}'.format(len(events), path))
class CollectEvents(object):
"""Event collector for elastic stack."""
def __init__(self, client, max_events=3000):
self.client: Elasticsearch = client
self.max_events = max_events
2020-06-29 23:17:38 -06:00
def _build_timestamp_map(self, index_str):
"""Build a mapping of indexes to timestamp data formats."""
mappings = self.client.indices.get_mapping(index=index_str)
timestamp_map = {n: m['mappings'].get('properties', {}).get('@timestamp', {}) for n, m in mappings.items()}
return timestamp_map
def _get_last_event_time(self, index_str, dsl=None):
2020-06-29 23:17:38 -06:00
"""Get timestamp of most recent event."""
last_event = self.client.search(dsl, index_str, size=1, sort='@timestamp:desc')['hits']['hits']
if not last_event:
return
2020-06-29 23:17:38 -06:00
last_event = last_event[0]
2020-06-29 23:17:38 -06:00
index = last_event['_index']
timestamp = last_event['_source']['@timestamp']
timestamp_map = self._build_timestamp_map(index_str)
2020-06-29 23:17:38 -06:00
event_date_format = timestamp_map[index].get('format', '').split('||')
# there are many native supported date formats and even custom data formats, but most, including beats use the
# default `strict_date_optional_time`. It would be difficult to try to account for all possible formats, so this
# will work on the default and unix time.
if set(event_date_format) & {'epoch_millis', 'epoch_second'}:
timestamp = unix_time_to_formatted(timestamp)
return timestamp
@staticmethod
def _prep_query(query, language, index, start_time=None, end_time=None):
"""Prep a query for search."""
index_str = ','.join(index if isinstance(index, (list, tuple)) else index.split(','))
lucene_query = query if language == 'lucene' else None
if language in ('kql', 'kuery'):
formatted_dsl = {'query': kql.to_dsl(query)}
elif language == 'eql':
formatted_dsl = {'query': query, 'filter': MATCH_ALL}
elif language == 'lucene':
formatted_dsl = {'query': {'bool': {'filter': []}}}
elif language == 'dsl':
formatted_dsl = {'query': query}
else:
raise ValueError('Unknown search language')
if start_time or end_time:
end_time = end_time or 'now'
dsl = formatted_dsl['filter']['bool']['filter'] if language == 'eql' else \
formatted_dsl['query']['bool'].setdefault('filter', [])
add_range_to_dsl(dsl, start_time, end_time)
return index_str, formatted_dsl, lucene_query
def search(self, query, language, index: Union[str, list] = '*', start_time=None, end_time=None, size=None,
**kwargs):
"""Search an elasticsearch instance."""
index_str, formatted_dsl, lucene_query = self._prep_query(query=query, language=language, index=index,
start_time=start_time, end_time=end_time)
formatted_dsl.update(size=size or self.max_events)
if language == 'eql':
results = self.client.eql.search(body=formatted_dsl, index=index_str, **kwargs)['hits']
results = results.get('events') or results.get('sequences', [])
else:
results = self.client.search(body=formatted_dsl, q=lucene_query, index=index_str,
allow_no_indices=True, ignore_unavailable=True, **kwargs)['hits']['hits']
return results
2021-03-24 10:24:32 -06:00
def search_from_rule(self, *rules: TOMLRule, start_time=None, end_time='now', size=None):
"""Search an elasticsearch instance using a rule."""
from .misc import nested_get
async_client = AsyncSearchClient(self.client)
survey_results = {}
def parse_unique_field_results(rule_type, unique_fields, search_results):
parsed_results = defaultdict(lambda: defaultdict(int))
hits = search_results['hits']
hits = hits['hits'] if rule_type != 'eql' else hits.get('events') or hits.get('sequences', [])
for hit in hits:
for field in unique_fields:
match = nested_get(hit['_source'], field)
match = ','.join(sorted(match)) if isinstance(match, list) else match
parsed_results[field][match] += 1
# if rule.type == eql, structure is different
return {'results': parsed_results} if parsed_results else {}
multi_search = []
multi_search_rules = []
async_searches = {}
eql_searches = {}
for rule in rules:
if not rule.query:
continue
index_str, formatted_dsl, lucene_query = self._prep_query(query=rule.query,
language=rule.contents.get('language'),
index=rule.contents.get('index', '*'),
start_time=start_time,
end_time=end_time)
formatted_dsl.update(size=size or self.max_events)
# prep for searches: msearch for kql | async search for lucene | eql client search for eql
if rule.contents['language'] == 'kuery':
multi_search_rules.append(rule)
multi_search.append(json.dumps(
{'index': index_str, 'allow_no_indices': 'true', 'ignore_unavailable': 'true'}))
multi_search.append(json.dumps(formatted_dsl))
elif rule.contents['language'] == 'lucene':
# wait for 0 to try and force async with no immediate results (not guaranteed)
result = async_client.submit(body=formatted_dsl, q=rule.query, index=index_str,
allow_no_indices=True, ignore_unavailable=True,
wait_for_completion_timeout=0)
if result['is_running'] is True:
async_searches[rule] = result['id']
else:
survey_results[rule.id] = parse_unique_field_results(rule.type, rule.unique_fields,
result['response'])
elif rule.contents['language'] == 'eql':
eql_body = {
'index': index_str,
'params': {'ignore_unavailable': 'true', 'allow_no_indices': 'true'},
'body': {'query': rule.query, 'filter': formatted_dsl['filter']}
}
eql_searches[rule] = eql_body
# assemble search results
multi_search_results = self.client.msearch('\n'.join(multi_search) + '\n')
for index, result in enumerate(multi_search_results['responses']):
try:
rule = multi_search_rules[index]
survey_results[rule.id] = parse_unique_field_results(rule.type, rule.unique_fields, result)
except KeyError:
survey_results[multi_search_rules[index].id] = {'error_retrieving_results': True}
for rule, search_args in eql_searches.items():
try:
result = self.client.eql.search(**search_args)
survey_results[rule.id] = parse_unique_field_results(rule.type, rule.unique_fields, result)
except (elasticsearch.NotFoundError, elasticsearch.RequestError) as e:
survey_results[rule.id] = {'error_retrieving_results': True, 'error': e.info['error']['reason']}
for rule, async_id in async_searches.items():
result = async_client.get(async_id)['response']
survey_results[rule.id] = parse_unique_field_results(rule.type, rule.unique_fields, result)
return survey_results
def count(self, query, language, index: Union[str, list], start_time=None, end_time='now'):
"""Get a count of documents from elasticsearch."""
index_str, formatted_dsl, lucene_query = self._prep_query(query=query, language=language, index=index,
start_time=start_time, end_time=end_time)
# EQL API has no count endpoint
if language == 'eql':
results = self.search(query=query, language=language, index=index, start_time=start_time, end_time=end_time,
size=1000)
return len(results)
else:
return self.client.count(body=formatted_dsl, index=index_str, q=lucene_query, allow_no_indices=True,
ignore_unavailable=True)['count']
def count_from_rule(self, *rules, start_time=None, end_time='now'):
"""Get a count of documents from elasticsearch using a rule."""
survey_results = {}
2020-06-29 23:17:38 -06:00
for rule in rules:
rule_results = {'rule_id': rule.id, 'name': rule.name}
2020-06-29 23:17:38 -06:00
if not rule.query:
continue
try:
rule_results['search_count'] = self.count(query=rule.query, language=rule.contents.get('language'),
index=rule.contents.get('index', '*'), start_time=start_time,
end_time=end_time)
except (elasticsearch.NotFoundError, elasticsearch.RequestError):
rule_results['search_count'] = -1
survey_results[rule.id] = rule_results
return survey_results
class CollectRtaEvents(CollectEvents):
"""Collect RTA events from elasticsearch."""
2020-06-29 23:17:38 -06:00
@staticmethod
def _group_events_by_type(events):
"""Group events by agent.type."""
event_by_type = {}
for event in events:
2020-06-29 23:17:38 -06:00
event_by_type.setdefault(event['_source']['agent']['type'], []).append(event['_source'])
return event_by_type
def run(self, dsl, indexes, start_time):
2020-06-29 23:17:38 -06:00
"""Collect the events."""
results = self.search(dsl, language='dsl', index=indexes, start_time=start_time, end_time='now', size=5000,
sort='@timestamp:asc')
events = self._group_events_by_type(results)
return RtaEvents(events)
2020-06-29 23:17:38 -06:00
@root.command('normalize-data')
@click.argument('events-file', type=click.File('r'))
def normalize_data(events_file):
"""Normalize Elasticsearch data timestamps and sort."""
file_name = os.path.splitext(os.path.basename(events_file.name))[0]
events = RtaEvents({file_name: [json.loads(e) for e in events_file.readlines()]})
events.save(dump_dir=os.path.dirname(events_file.name))
@root.group('es')
@add_params(*elasticsearch_options)
@click.pass_context
def es_group(ctx: click.Context, **kwargs):
"""Commands for integrating with Elasticsearch."""
ctx.ensure_object(dict)
# only initialize an es client if the subcommand is invoked without help (hacky)
if click.get_os_args()[-1] in ctx.help_option_names:
click.echo('Elasticsearch client:')
click.echo(format_command_options(ctx))
else:
ctx.obj['es'] = get_elasticsearch_client(ctx=ctx, **kwargs)
2020-06-29 23:17:38 -06:00
@es_group.command('collect-events')
@click.argument('host-id')
@click.option('--query', '-q', help='KQL query to scope search')
2020-06-29 23:17:38 -06:00
@click.option('--index', '-i', multiple=True, help='Index(es) to search against (default: all indexes)')
@click.option('--rta-name', '-r', help='Name of RTA in order to save events directly to unit tests data directory')
@click.option('--rule-id', help='Updates rule mapping in rule-mapping.yml file (requires --rta-name)')
@click.option('--view-events', is_flag=True, help='Print events after saving')
@click.pass_context
def collect_events(ctx, host_id, query, index, rta_name, rule_id, view_events):
2020-06-29 23:17:38 -06:00
"""Collect events from Elasticsearch."""
client: Elasticsearch = ctx.obj['es']
dsl = kql.to_dsl(query) if query else MATCH_ALL
dsl['bool'].setdefault('filter', []).append({'bool': {'should': [{'match_phrase': {'host.id': host_id}}]}})
2020-06-29 23:17:38 -06:00
try:
collector = CollectRtaEvents(client)
start = time.time()
click.pause('Press any key once detonation is complete ...')
start_time = f'now-{round(time.time() - start) + 5}s'
events = collector.run(dsl, index or '*', start_time)
events.save(rta_name=rta_name, host_id=host_id)
2020-06-29 23:17:38 -06:00
if rta_name and rule_id:
events.evaluate_against_rule_and_update_mapping(rule_id, rta_name)
2020-06-29 23:17:38 -06:00
if view_events and events.events:
events.echo_events(pager=True)
2020-06-29 23:17:38 -06:00
return events
except AssertionError as e:
error_msg = 'No events collected! Verify events are streaming and that the agent-hostname is correct'
client_error(error_msg, e, ctx=ctx)
@es_group.command('index-rules')
@click.option('--query', '-q', help='Optional KQL query to limit to specific rules')
@click.option('--from-file', '-f', type=click.File('r'), help='Load a previously saved uploadable bulk file')
@click.option('--save_files', '-s', is_flag=True, help='Optionally save the bulk request to a file')
@click.pass_context
def index_repo(ctx: click.Context, query, from_file, save_files):
"""Index rules based on KQL search results to an elasticsearch instance."""
from .main import generate_rules_index
es_client: Elasticsearch = ctx.obj['es']
if from_file:
bulk_upload_docs = from_file.read()
# light validation only
try:
index_body = [json.loads(line) for line in bulk_upload_docs.splitlines()]
click.echo(f'{len([r for r in index_body if "rule" in r])} rules included')
except json.JSONDecodeError:
client_error(f'Improperly formatted bulk request file: {from_file.name}')
else:
bulk_upload_docs, importable_rules_docs = ctx.invoke(generate_rules_index, query=query, save_files=save_files)
es_client.bulk(bulk_upload_docs)
@es_group.group('experimental')
def es_experimental():
"""[Experimental] helper commands for integrating with Elasticsearch."""
click.secho('\n* experimental commands are use at your own risk and may change without warning *\n')