# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.

"""Elasticsearch cli commands."""

import json
import sys
import time
from collections import defaultdict
from copy import deepcopy
from pathlib import Path
from typing import IO, Any

import click
import elasticsearch
import kql  # type: ignore[reportMissingTypeStubs]
from elasticsearch import Elasticsearch
from elasticsearch.client import AsyncSearchClient

from .config import parse_rules_config
from .main import root
from .misc import add_params, elasticsearch_options, get_elasticsearch_client, nested_get, raise_client_error
from .rule import TOMLRule
from .rule_loader import RuleCollection
from .utils import event_sort, format_command_options, get_path, normalize_timing_and_sort, unix_time_to_formatted

COLLECTION_DIR = get_path(["collections"])
MATCH_ALL: dict[str, dict[str, Any]] = {"bool": {"filter": [{"match_all": {}}]}}
RULES_CONFIG = parse_rules_config()


def add_range_to_dsl(dsl_filter: list[dict[str, Any]], start_time: str, end_time: str = "now") -> None:
    dsl_filter.append(
        {
            "range": {
                "@timestamp": {
                    "gt": start_time,
                    "lte": end_time,
                    "format": "strict_date_optional_time",
                },
            },
        }
    )
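

# Illustrative example (not part of the original module): calling
# add_range_to_dsl(filters, "now-30m") appends the clause below to `filters`,
# per the function above:
#
#   {"range": {"@timestamp": {"gt": "now-30m", "lte": "now", "format": "strict_date_optional_time"}}}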


def parse_unique_field_results(
    rule_type: str,
    unique_fields: list[str],
    search_results: dict[str, Any],
) -> dict[str, Any]:
    """Tally unique values per field across search hits."""
    parsed_results: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    hits = search_results["hits"]
    # if rule_type == eql, the hit structure is different: results are under "events" or "sequences"
    hits = hits["hits"] if rule_type != "eql" else hits.get("events") or hits.get("sequences", [])

    for hit in hits:
        for field in unique_fields:
            if "events" in hit:
                match: list[Any] = []
                for event in hit["events"]:
                    matched = nested_get(event["_source"], field)
                    match.extend([matched] if not isinstance(matched, list) else matched)  # type: ignore[reportUnknownArgumentType]
                if not match:
                    continue
            else:
                match = nested_get(hit["_source"], field)
                if not match:
                    continue

            match = ",".join(sorted(match)) if isinstance(match, list) else match  # type: ignore[reportUnknownArgumentType]
            parsed_results[field][match] += 1  # type: ignore[reportUnknownArgumentType]

    return {"results": parsed_results} if parsed_results else {}
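

# Illustrative example (hypothetical values): for a non-eql rule with
# unique_fields=["process.name"] and three hits on "cmd.exe", the function above
# returns:
#
#   {"results": {"process.name": {"cmd.exe": 3}}}
#
# and an empty dict when no hits match any of the unique fields.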


class Events:
    """Events collected from Elasticsearch."""

    def __init__(self, events: dict[str, Any]) -> None:
        self.events = self._normalize_event_timing(events)

    @staticmethod
    def _normalize_event_timing(events: dict[str, Any]) -> dict[str, Any]:
        """Normalize event timestamps and sort."""
        for agent_type, _events in events.items():
            events[agent_type] = normalize_timing_and_sort(_events)

        return events

    @staticmethod
    def _get_dump_dir(
        rta_name: str | None = None,
        host_id: str | None = None,
        host_os_family: str | None = None,
    ) -> Path:
        """Prepare and get the dump path."""
        if rta_name and host_os_family:
            dump_dir = get_path(["unit_tests", "data", "true_positives", rta_name, host_os_family])
            dump_dir.mkdir(parents=True, exist_ok=True)
            return dump_dir

        time_str = time.strftime("%Y%m%dT%H%M%SL")
        dump_dir = COLLECTION_DIR / (host_id or "unknown_host") / time_str
        dump_dir.mkdir(parents=True, exist_ok=True)
        return dump_dir

    def evaluate_against_rule(self, rule_id: str, verbose: bool = True) -> list[Any]:
        """Evaluate a rule against collected events and update mapping."""
        rule = RuleCollection.default().id_map.get(rule_id)
        if not rule:
            raise ValueError(f"Unable to find rule with ID {rule_id}")

        merged_events = combine_sources(*self.events.values())
        filtered = evaluate(rule, merged_events, normalize_kql_keywords=RULES_CONFIG.normalize_kql_keywords)

        if verbose:
            click.echo("Matching results found")

        return filtered

    def echo_events(self, pager: bool = False, pretty: bool = True) -> None:
        """Print events to stdout."""
        echo_fn = click.echo_via_pager if pager else click.echo
        echo_fn(json.dumps(self.events, indent=2 if pretty else None, sort_keys=True))

    def save(self, rta_name: str | None = None, dump_dir: Path | None = None, host_id: str | None = None) -> None:
        """Save collected events."""
        if not self.events:
            raise ValueError("Nothing to save. Run Collector.run() method first or verify logging")

        host_os_family = None
        for key in self.events:
            host_info = self.events[key][0].get("host", {})
            if host_info.get("id") == host_id:
                host_os_family = host_info.get("os", {}).get("family")
                break

        if not host_os_family:
            click.echo(f"Unable to determine host.os.family for host_id: {host_id}")
            host_os_family = click.prompt(
                "Please enter the host.os.family for this host_id",
                type=click.Choice(["windows", "macos", "linux"]),
                default="windows",
            )

        dump_dir = dump_dir or self._get_dump_dir(rta_name=rta_name, host_id=host_id, host_os_family=host_os_family)

        for source, events in self.events.items():
            path = dump_dir / (source + ".ndjson")
            with path.open("w") as f:
                f.writelines([json.dumps(e, sort_keys=True) + "\n" for e in events])
            click.echo(f"{len(events)} events saved to: {path}")


class CollectEvents:
    """Event collector for elastic stack."""

    def __init__(self, client: Elasticsearch, max_events: int = 3000) -> None:
        self.client = client
        self.max_events = max_events

    def _build_timestamp_map(self, index: str) -> dict[str, Any]:
        """Build a mapping of indexes to timestamp data formats."""
        mappings = self.client.indices.get_mapping(index=index)
        return {n: m["mappings"].get("properties", {}).get("@timestamp", {}) for n, m in mappings.items()}

    def _get_last_event_time(self, index: str, dsl: dict[str, Any] | None = None) -> None | str:
        """Get the timestamp of the most recent event."""
        last_event = self.client.search(query=dsl, index=index, size=1, sort="@timestamp:desc")["hits"]["hits"]
        if not last_event:
            return None

        last_event = last_event[0]
        index = last_event["_index"]
        timestamp = last_event["_source"]["@timestamp"]

        timestamp_map = self._build_timestamp_map(index)
        event_date_format = timestamp_map[index].get("format", "").split("||")

        # There are many natively supported date formats, and even custom ones, but most producers
        # (including beats) use the default `strict_date_optional_time`. It would be difficult to
        # account for every possible format, so this handles the default plus unix time.
        if set(event_date_format) & {"epoch_millis", "epoch_second"}:
            timestamp = unix_time_to_formatted(timestamp)

        return timestamp
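
    # Illustrative note (hypothetical values): if the index mapping declares
    # "format": "epoch_millis||strict_date_optional_time", a raw @timestamp such as
    # 1593468000000 is converted via unix_time_to_formatted before being returned.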

    @staticmethod
    def _prep_query(
        query: str | dict[str, Any],
        language: str,
        index: str | list[str] | tuple[str],
        start_time: str | None = None,
        end_time: str | None = None,
    ) -> tuple[str, dict[str, Any], str | None]:
        """Prep a query for search."""
        index_str = ",".join(index if isinstance(index, (list, tuple)) else index.split(","))
        lucene_query = str(query) if language == "lucene" else None

        if language in ("kql", "kuery"):
            formatted_dsl = {"query": kql.to_dsl(query)}  # type: ignore[reportUnknownMemberType]
        elif language == "eql":
            # deep copy so appending range filters below does not mutate the shared MATCH_ALL template
            formatted_dsl = {"query": query, "filter": deepcopy(MATCH_ALL)}
        elif language == "lucene":
            formatted_dsl: dict[str, Any] = {"query": {"bool": {"filter": []}}}
        elif language == "dsl":
            formatted_dsl = {"query": query}
        else:
            raise ValueError(f"Unknown search language: {language}")

        if start_time or end_time:
            end_time = end_time or "now"
            dsl = (
                formatted_dsl["filter"]["bool"]["filter"]
                if language == "eql"
                else formatted_dsl["query"]["bool"].setdefault("filter", [])
            )
            if not start_time:
                raise ValueError("No start time provided")
            add_range_to_dsl(dsl, start_time, end_time)

        return index_str, formatted_dsl, lucene_query
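
    # Illustrative example (hypothetical values): for a lucene search over a time
    # window, _prep_query returns a triple shaped like:
    #
    #   ("logs-*",
    #    {"query": {"bool": {"filter": [{"range": {"@timestamp": {"gt": "now-1h", "lte": "now",
    #                                                             "format": "strict_date_optional_time"}}}]}}},
    #    "process.name:cmd.exe")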

    def search(  # noqa: PLR0913
        self,
        query: str | dict[str, Any],
        language: str,
        index: str | list[str] = "*",
        start_time: str | None = None,
        end_time: str | None = None,
        size: int | None = None,
        **kwargs: Any,
    ) -> list[Any]:
        """Search an elasticsearch instance."""
        index_str, formatted_dsl, lucene_query = self._prep_query(
            query=query, language=language, index=index, start_time=start_time, end_time=end_time
        )
        formatted_dsl.update(size=size or self.max_events)

        if language == "eql":
            results = self.client.eql.search(body=formatted_dsl, index=index_str, **kwargs)["hits"]
            results = results.get("events") or results.get("sequences", [])
        else:
            results = self.client.search(
                body=formatted_dsl,
                q=lucene_query,
                index=index_str,
                allow_no_indices=True,
                ignore_unavailable=True,
                **kwargs,
            )["hits"]["hits"]

        return results
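
    # Minimal usage sketch (assumes an authenticated client; the index pattern and
    # query are hypothetical):
    #
    #   collector = CollectEvents(es_client)
    #   hits = collector.search("process.name:cmd.exe", language="kuery",
    #                           index="logs-endpoint.*", start_time="now-1h")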

    def search_from_rule(
        self,
        rules: RuleCollection,
        start_time: str | None = None,
        end_time: str = "now",
        size: int | None = None,
    ) -> dict[str, Any]:
        """Search an elasticsearch instance using a rule."""
        async_client = AsyncSearchClient(self.client)
        survey_results: dict[str, Any] = {}
        multi_search: list[dict[str, Any]] = []
        multi_search_rules: list[TOMLRule] = []
        async_searches: list[tuple[TOMLRule, Any]] = []
        eql_searches: list[tuple[TOMLRule, dict[str, Any]]] = []

        for rule in rules:
            if not rule.contents.data.get("query"):
                continue

            language = rule.contents.data.get("language")
            query = rule.contents.data.query  # type: ignore[reportAttributeAccessIssue]
            rule_type = rule.contents.data.type
            index_str, formatted_dsl, _ = self._prep_query(
                query=query,  # type: ignore[reportUnknownArgumentType]
                language=language,  # type: ignore[reportUnknownArgumentType]
                index=rule.contents.data.get("index", "*"),  # type: ignore[reportUnknownArgumentType]
                start_time=start_time,
                end_time=end_time,
            )
            formatted_dsl.update(size=size or self.max_events)

            # prep for searches: msearch for kql | async search for lucene | eql client search for eql
            if language == "kuery":
                multi_search_rules.append(rule)
                multi_search.append({"index": index_str, "allow_no_indices": "true", "ignore_unavailable": "true"})
                multi_search.append(formatted_dsl)
            elif language == "lucene":
                # wait for 0 to try and force async with no immediate results (not guaranteed)
                result = async_client.submit(
                    body=formatted_dsl,
                    q=query,  # type: ignore[reportUnknownArgumentType]
                    index=index_str,
                    allow_no_indices=True,
                    ignore_unavailable=True,
                    wait_for_completion_timeout=0,
                )
                if result["is_running"] is True:
                    async_searches.append((rule, result["id"]))
                else:
                    survey_results[rule.id] = parse_unique_field_results(
                        rule_type, ["process.name"], result["response"]
                    )
            elif language == "eql":
                eql_body: dict[str, Any] = {
                    "index": index_str,
                    "params": {"ignore_unavailable": "true", "allow_no_indices": "true"},
                    "body": {"query": query, "filter": formatted_dsl["filter"]},
                }
                eql_searches.append((rule, eql_body))

        # assemble search results
        multi_search_results = self.client.msearch(searches=multi_search)
        for index, result in enumerate(multi_search_results["responses"]):
            try:
                rule = multi_search_rules[index]
                survey_results[rule.id] = parse_unique_field_results(
                    rule.contents.data.type,
                    rule.contents.data.unique_fields,  # type: ignore[reportAttributeAccessIssue]
                    result,
                )
            except KeyError:
                survey_results[multi_search_rules[index].id] = {"error_retrieving_results": True}

        for entry in eql_searches:
            rule, search_args = entry
            try:
                result = self.client.eql.search(**search_args)
                survey_results[rule.id] = parse_unique_field_results(
                    rule.contents.data.type,
                    rule.contents.data.unique_fields,  # type: ignore[reportAttributeAccessIssue]
                    result,  # type: ignore[reportAttributeAccessIssue]
                )
            except (elasticsearch.NotFoundError, elasticsearch.RequestError) as e:
                survey_results[rule.id] = {"error_retrieving_results": True, "error": e.info["error"]["reason"]}

        for entry in async_searches:
            rule: TOMLRule
            rule, async_id = entry
            result = async_client.get(id=async_id)["response"]
            survey_results[rule.id] = parse_unique_field_results(rule.contents.data.type, ["process.name"], result)

        return survey_results

    def count(
        self,
        query: str,
        language: str,
        index: str | list[str],
        start_time: str | None = None,
        end_time: str | None = "now",
    ) -> Any:
        """Get a count of documents from elasticsearch."""
        index_str, formatted_dsl, lucene_query = self._prep_query(
            query=query,
            language=language,
            index=index,
            start_time=start_time,
            end_time=end_time,
        )

        # EQL API has no count endpoint, so fall back to a sized search and count the results
        if language == "eql":
            results = self.search(
                query=query,
                language=language,
                index=index,
                start_time=start_time,
                end_time=end_time,
                size=1000,
            )
            return len(results)

        resp = self.client.count(
            body=formatted_dsl,
            index=index_str,
            q=lucene_query,
            allow_no_indices=True,
            ignore_unavailable=True,
        )
        return resp["count"]

    def count_from_rule(
        self,
        rules: RuleCollection,
        start_time: str | None = None,
        end_time: str | None = "now",
    ) -> dict[str, Any]:
        """Get a count of documents from elasticsearch using a rule."""
        survey_results: dict[str, Any] = {}

        for rule in rules.rules:
            rule_results: dict[str, Any] = {"rule_id": rule.id, "name": rule.name}

            if not rule.contents.data.get("query"):
                continue

            try:
                rule_results["search_count"] = self.count(
                    query=rule.contents.data.query,  # type: ignore[reportAttributeAccessIssue]
                    language=rule.contents.data.language,  # type: ignore[reportAttributeAccessIssue]
                    index=rule.contents.data.get("index", "*"),  # type: ignore[reportAttributeAccessIssue]
                    start_time=start_time,
                    end_time=end_time,
                )
            except (elasticsearch.NotFoundError, elasticsearch.RequestError):
                rule_results["search_count"] = -1

            survey_results[rule.id] = rule_results

        return survey_results


def evaluate(rule: TOMLRule, events: list[Any], normalize_kql_keywords: bool = False) -> list[Any]:
    """Evaluate a query against events."""
    evaluator = kql.get_evaluator(kql.parse(rule.query), normalize_kql_keywords=normalize_kql_keywords)  # type: ignore[reportUnknownMemberType]
    return list(filter(evaluator, events))  # type: ignore[reportUnknownMemberType]


def combine_sources(*sources: list[Any]) -> list[Any]:
    """Combine lists of events from multiple sources."""
    combined: list[Any] = []
    for source in sources:
        combined.extend(source.copy())

    return event_sort(combined)
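

# Illustrative example (hypothetical values): combine_sources([e1, e2], [e3])
# returns [e1, e2, e3] merged and re-sorted chronologically by event_sort.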


class CollectEventsWithDSL(CollectEvents):
    """Collect events from elasticsearch."""

    @staticmethod
    def _group_events_by_type(events: list[Any]) -> dict[str, list[Any]]:
        """Group events by agent.type."""
        event_by_type: dict[str, list[Any]] = {}

        for event in events:
            event_by_type.setdefault(event["_source"]["agent"]["type"], []).append(event["_source"])

        return event_by_type

    def run(self, dsl: dict[str, Any], indexes: str | list[str], start_time: str) -> Events:
        """Collect the events."""
        results = self.search(
            dsl,
            language="dsl",
            index=indexes,
            start_time=start_time,
            end_time="now",
            size=5000,
            sort=[{"@timestamp": {"order": "asc"}}],
        )
        events = self._group_events_by_type(results)
        return Events(events)
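

# Minimal usage sketch (assumes an authenticated client; the index pattern and
# window are hypothetical):
#
#   collector = CollectEventsWithDSL(es_client)
#   events = collector.run(deepcopy(MATCH_ALL), "logs-*", start_time="now-15m")
#   events.echo_events(pretty=True)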


@root.command("normalize-data")
@click.argument("events-file", type=Path)
def normalize_data(events_file: Path) -> None:
    """Normalize Elasticsearch data timestamps and sort."""
    file_name = events_file.name
    content = events_file.read_text()
    lines = content.splitlines()
    events = Events({file_name: [json.loads(line) for line in lines]})
    events.save(dump_dir=events_file.parent)
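

# Example invocation (hypothetical file name; assumes the package is run as
# `python -m detection_rules`):
#
#   python -m detection_rules normalize-data collected-events.ndjson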
@root.group("es")
@add_params(*elasticsearch_options)
@click.pass_context
def es_group(ctx: click.Context, **kwargs: Any) -> None:
"""Commands for integrating with Elasticsearch."""
_ = ctx.ensure_object(dict) # type: ignore[reportUnknownVariableType]
# only initialize an es client if the subcommand is invoked without help (hacky)
if sys.argv[-1] in ctx.help_option_names:
click.echo("Elasticsearch client:")
click.echo(format_command_options(ctx))
else:
ctx.obj["es"] = get_elasticsearch_client(ctx=ctx, **kwargs)
@es_group.command("collect-events")
@click.argument("host-id")
@click.option("--query", "-q", help="KQL query to scope search")
@click.option("--index", "-i", multiple=True, help="Index(es) to search against (default: all indexes)")
@click.option("--rta-name", "-r", help="Name of RTA in order to save events directly to unit tests data directory")
@click.option("--rule-id", help="Updates rule mapping in rule-mapping.yaml file (requires --rta-name)")
@click.option("--view-events", is_flag=True, help="Print events after saving")
@click.pass_context
def collect_events( # noqa: PLR0913
ctx: click.Context,
host_id: str,
query: str,
index: list[str],
rta_name: str,
rule_id: str,
view_events: bool,
) -> Events:
2020-06-29 23:17:38 -06:00
"""Collect events from Elasticsearch."""
client: Elasticsearch = ctx.obj["es"]
dsl = kql.to_dsl(query) if query else MATCH_ALL # type: ignore[reportUnknownMemberType]
dsl["bool"].setdefault("filter", []).append( # type: ignore[reportUnknownMemberType]
{
"bool": {
"should": [{"match_phrase": {"host.id": host_id}}],
},
}
)
2020-06-29 23:17:38 -06:00
try:
2025-03-05 12:35:57 +01:00
collector = CollectEventsWithDSL(client)
start = time.time()
click.pause("Press any key once detonation is complete ...")
start_time = f"now-{round(time.time() - start) + 5}s"
events = collector.run(dsl, index or "*", start_time) # type: ignore[reportUnknownArgument]
events.save(rta_name=rta_name, host_id=host_id)
2020-06-29 23:17:38 -06:00
if rta_name and rule_id:
_ = events.evaluate_against_rule(rule_id)
2020-06-29 23:17:38 -06:00
if view_events and events.events:
events.echo_events(pager=True)
2020-06-29 23:17:38 -06:00
except AssertionError as e:
error_msg = "No events collected! Verify events are streaming and that the agent-hostname is correct"
raise_client_error(error_msg, e, ctx=ctx)
return events
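

# Example invocation (hypothetical host ID and query; assumes the package is run
# as `python -m detection_rules`):
#
#   python -m detection_rules es collect-events my-host-id -q "process.name:cmd.exe" --view-events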
@es_group.command("index-rules")
@click.option("--query", "-q", help="Optional KQL query to limit to specific rules")
@click.option("--from-file", "-f", type=click.File("r"), help="Load a previously saved uploadable bulk file")
@click.option("--save_files", "-s", is_flag=True, help="Optionally save the bulk request to a file")
@click.pass_context
def index_repo(ctx: click.Context, query: str, from_file: IO[Any] | None, save_files: bool) -> None:
"""Index rules based on KQL search results to an elasticsearch instance."""
from .main import generate_rules_index
es_client: Elasticsearch = ctx.obj["es"]
if from_file:
bulk_upload_docs = from_file.read()
# light validation only
try:
index_body = [json.loads(line) for line in bulk_upload_docs.splitlines()]
click.echo(f"{len([r for r in index_body if 'rule' in r])} rules included")
except json.JSONDecodeError:
raise_client_error(f"Improperly formatted bulk request file: {from_file.name}")
else:
bulk_upload_docs, _ = ctx.invoke(generate_rules_index, query=query, save_files=save_files)
_ = es_client.bulk(operations=bulk_upload_docs)
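

# Example invocation (hypothetical query; assumes the package is run as
# `python -m detection_rules`):
#
#   python -m detection_rules es index-rules -q "metadata.maturity: production" -s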