Files
sigma-rules/detection_rules/beats.py
T

303 lines
12 KiB
Python
Raw Normal View History

2020-06-29 23:17:38 -06:00
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2021-03-03 22:12:11 -09:00
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
2020-06-29 23:17:38 -06:00
"""ECS Schemas management."""
import json
2020-06-29 23:17:38 -06:00
import os
import re
from pathlib import Path
from typing import Any
2020-06-29 23:17:38 -06:00
import eql # type: ignore[reportMissingTypeStubs]
import kql # type: ignore[reportMissingTypeStubs]
2020-06-29 23:17:38 -06:00
import requests
import yaml
from semver import Version
2020-06-29 23:17:38 -06:00
from .utils import DateTimeEncoder, cached, get_etc_path, gzip_compress, read_gzip, unzip
2020-06-29 23:17:38 -06:00
def _decompress_and_save_schema(url: str, release_name: str) -> None:
    """Download a beats release zip, extract its field schemas, and save them gzipped.

    Only `fields.yml`, `fields.common.yml` and `config.yml` files are kept. They are
    arranged into a hierarchical ``{"folders": {...}, "files": {...}}`` structure and
    written to ``detection_rules/etc/beats_schemas/<release_name>.json.gz``.
    """
    print(f"Downloading beats {release_name}")
    response = requests.get(url, timeout=30)

    print(f"Downloaded {len(response.content) / 1024.0 / 1024.0:.2f} MB release.")
    fs: dict[str, Any] = {}

    with unzip(response.content) as archive:
        base_directory = archive.namelist()[0]

        for name in archive.namelist():
            path = Path(name)

            if path.name in ("fields.yml", "fields.common.yml", "config.yml"):
                contents = archive.read(name)

                # chop off the base directory name
                key = name[len(base_directory) :]

                if key.startswith("x-pack"):
                    key = key[len("x-pack") + 1 :]

                try:
                    decoded = yaml.safe_load(contents)
                except yaml.YAMLError as e:
                    print(f"Error loading {name}")
                    raise ValueError(f"Error loading {name}") from e

                # create a hierarchical structure
                branch = fs
                directory, base_name = os.path.split(key)
                for limb in directory.split(os.path.sep):
                    branch = branch.setdefault("folders", {}).setdefault(limb, {})
                branch.setdefault("files", {})[base_name] = decoded

    # remove all non-beat directories
    fs = {k: v for k, v in fs.get("folders", {}).items() if k.endswith("beat")}

    # message fixed to match the actual save location (beats_schemas, .json.gz)
    print(f"Saving detection_rules/etc/beats_schemas/{release_name}.json.gz")
    compressed = gzip_compress(json.dumps(fs, sort_keys=True, cls=DateTimeEncoder))
    path = get_etc_path(["beats_schemas", release_name + ".json.gz"])
    with path.open("wb") as f:
        _ = f.write(compressed)
2020-06-29 23:17:38 -06:00
def download_beats_schema(version: str) -> None:
    """Download a beats schema by version."""
    url = "https://api.github.com/repos/elastic/beats/releases"
    releases = requests.get(url, timeout=30)

    # normalize to the "v"-prefixed tag format used by the beats repo
    version = f"v{version.lstrip('v')}"

    # find the release whose tag matches the requested version
    beats_release = next((r for r in releases.json() if r["tag_name"] == version), None)

    if not beats_release:
        print(f"beats release {version} not found!")
        return

    _decompress_and_save_schema(beats_release["zipball_url"], beats_release["tag_name"])
def download_latest_beats_schema() -> None:
    """Download additional schemas from beats releases."""
    releases_url = "https://api.github.com/repos/elastic/beats/releases"
    all_releases = requests.get(releases_url, timeout=30).json()

    # pick the release with the highest semver tag (tags are "v"-prefixed)
    latest_release = max(all_releases, key=lambda r: Version.parse(r["tag_name"].lstrip("v")))

    download_beats_schema(latest_release["tag_name"])
def refresh_main_schema() -> None:
    """Download and refresh beats schema from main."""
    main_archive_url = "https://github.com/elastic/beats/archive/main.zip"
    _decompress_and_save_schema(main_archive_url, "main")
def _flatten_schema(schema: list[dict[str, Any]] | None, prefix: str = "") -> list[dict[str, Any]]:
2020-06-29 23:17:38 -06:00
if schema is None:
# sometimes we see `fields: null` in the yaml
return []
flattened: list[dict[str, Any]] = []
2020-06-29 23:17:38 -06:00
for s in schema:
if s.get("type") == "group":
nested_prefix = prefix + s["name"] + "."
2021-08-15 06:29:10 +02:00
# beats is complicated. it seems like we would expect a zoom.webhook.*, for the zoom.webhook dataset,
# but instead it's just at zoom.* directly.
#
# we have what looks like zoom.zoom.*, but should actually just be zoom.*.
# this is one quick heuristic to determine if a submodule nests fields at the parent.
# it's probably not perfect, but we can fix other bugs as we run into them later
if len(schema) == 1 and nested_prefix.startswith(prefix + prefix):
nested_prefix = s["name"] + "."
if "field" in s:
# integrations sometimes have a group with a single field
flattened.extend(_flatten_schema(s["field"], prefix=nested_prefix))
continue
if "fields" not in s:
# integrations sometimes have a group with no fields
continue
flattened.extend(_flatten_schema(s["fields"], prefix=nested_prefix))
2020-06-29 23:17:38 -06:00
elif "fields" in s:
flattened.extend(_flatten_schema(s["fields"], prefix=prefix))
elif "name" in s:
_s = s.copy()
# type is implicitly keyword if not defined
# example: https://github.com/elastic/beats/blob/main/packetbeat/_meta/fields.common.yml#L7-L12
_s.setdefault("type", "keyword")
_s["name"] = prefix + s["name"]
flattened.append(_s)
2020-06-29 23:17:38 -06:00
return flattened
def flatten_ecs_schema(schema: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Flatten a nested ECS-style field schema into a flat list of field entries."""
    return _flatten_schema(schema)
def get_field_schema(
    base_directory: dict[str, Any],
    prefix: str = "",
    include_common: bool = False,
) -> list[dict[str, Any]]:
    """Flatten the fields files found under a directory's `_meta` folder.

    Reads `fields.yml` (and `fields.common.yml` when *include_common* is set)
    and returns the flattened field entries, prefixed with *prefix*.
    """
    meta_files = base_directory.get("folders", {}).get("_meta", {}).get("files", {})

    wanted_files = ("fields.yml", "fields.common.yml") if include_common else ("fields.yml",)

    flattened: list[dict[str, Any]] = []
    for file_name in wanted_files:
        if file_name in meta_files:
            flattened.extend(_flatten_schema(meta_files[file_name], prefix=prefix))

    return flattened
def get_beat_root_schema(schema: dict[str, Any], beat: str) -> dict[str, Any]:
    """Return the flattened top-level field schema for a single beat, keyed by field name."""
    if beat not in schema:
        raise KeyError(f"Unknown beats module {beat}")

    fields = get_field_schema(schema[beat], include_common=True)
    return {entry["name"]: entry for entry in sorted(fields, key=lambda e: e["name"])}
def get_beats_sub_schema(schema: dict[str, Any], beat: str, module: str, *datasets: str) -> dict[str, Any]:
    """Return the flattened schema for a beat module (optionally limited to datasets), keyed by field name."""
    if beat not in schema:
        raise KeyError(f"Unknown beats module {beat}")

    module_dir = schema[beat].get("folders", {}).get("module", {}).get("folders", {}).get(module, {})

    # if we only have a module then we'll work with what we got
    wanted_datasets = list(datasets) or [d for d in module_dir.get("folders", {}) if not d.startswith("_")]

    module_prefix = module + "."
    collected: list[dict[str, Any]] = []

    for raw_dataset in wanted_datasets:
        # replace aws.s3 -> s3
        dataset = raw_dataset.removeprefix(module_prefix)

        dataset_dir = module_dir.get("folders", {}).get(dataset, {})
        collected.extend(get_field_schema(dataset_dir, prefix=module_prefix, include_common=True))

    # we also need to capture (beta?) fields which are directly within the module _meta.files.fields
    collected.extend(get_field_schema(module_dir, include_common=True))

    return {field["name"]: field for field in sorted(collected, key=lambda f: f["name"])}
2020-09-16 08:36:48 -06:00
@cached
def get_versions() -> list[Version]:
    """Return all locally available beats schema versions.

    Scans the `beats_schemas` etc directory for `v<semver>.json.gz` files and
    parses the version out of each filename.
    """
    versions: list[Version] = []
    # get_etc_path returns a Path, so iterate it directly instead of os.listdir
    for schema_file in get_etc_path(["beats_schemas"]).iterdir():
        version_match = re.match(r"v(.+)\.json\.gz", schema_file.name)
        if version_match:
            versions.append(Version.parse(version_match.groups()[0]))
    return versions
@cached
def get_max_version() -> str:
    """Return the highest locally available beats schema version as a string."""
    return str(max(get_versions()))
@cached
def read_beats_schema(version: str | None = None) -> dict[str, Any]:
    """Load a gzipped beats schema from disk.

    Accepts "main" for the main-branch schema, a specific version string, or
    None to load the highest available version. Raises ValueError for a
    version that is not available locally.
    """
    if version and version.lower() == "main":
        # the main-branch schema is tracked separately from versioned releases
        return json.loads(read_gzip(get_etc_path(["beats_schemas", "main.json.gz"])))

    if version:
        parsed = Version.parse(version)
        if parsed not in get_versions():
            raise ValueError(f"Unknown beats schema: {parsed}")

    target = version or get_max_version()
    return json.loads(read_gzip(get_etc_path(["beats_schemas", f"v{target}.json.gz"])))
2020-06-29 23:17:38 -06:00
def get_schema_from_datasets(
    beats: list[str],
    modules: set[str],
    datasets: set[str],
    version: str | None = None,
) -> dict[str, Any]:
    """Build a combined field schema for the given beats, modules and datasets.

    For each beat, the root schema is always included and any module/dataset
    schemas are layered on top of it.
    """
    filtered: dict[str, Any] = {}
    beats_schema = read_beats_schema(version=version)

    # infer the modules if only datasets are defined; bind a new set so the
    # caller's `modules` argument is not mutated in place
    if not modules:
        modules = {ds.split(".")[0] for ds in datasets if "." in ds}

    for beat in beats:
        # if no modules are specified then grab them all
        filtered.update(get_beat_root_schema(beats_schema, beat))

        for module in modules:
            filtered.update(get_beats_sub_schema(beats_schema, beat, module, *datasets))

    return filtered
2020-09-16 08:36:48 -06:00
def get_datasets_and_modules(tree: eql.ast.BaseNode | kql.ast.BaseNode) -> tuple[set[str], set[str]]:
    """Get datasets and modules from an EQL or KQL AST.

    Walks every node of the parsed query and collects string values compared
    against `event.module` and `event.dataset`. Returns (datasets, modules).
    """
    modules: set[str] = set()
    datasets: set[str] = set()

    # extract out event.module and event.dataset from the query's AST
    for node in tree:  # type: ignore[reportUnknownVariableType]
        # EQL equality: event.module == "x" / event.dataset == "x"
        if (
            isinstance(node, eql.ast.Comparison)
            and node.comparator == node.EQ
            and isinstance(node.right, eql.ast.String)
        ):
            if node.left == eql.ast.Field("event", ["module"]):
                modules.add(node.right.render())  # type: ignore[reportUnknownMemberType]

            elif node.left == eql.ast.Field("event", ["dataset"]):
                datasets.add(node.right.render())  # type: ignore[reportUnknownMemberType]

        # EQL set membership: event.module in ("x", "y", ...)
        elif isinstance(node, eql.ast.InSet):
            if node.expression == eql.ast.Field("event", ["module"]):
                modules.update(node.get_literals())  # type: ignore[reportUnknownMemberType]

            elif node.expression == eql.ast.Field("event", ["dataset"]):
                datasets.update(node.get_literals())  # type: ignore[reportUnknownMemberType]

        # KQL field comparisons: only String children are collected
        elif isinstance(node, kql.ast.FieldComparison) and node.field == kql.ast.Field("event.module"):  # type: ignore[reportUnknownMemberType]
            modules.update(child.value for child in node.value if isinstance(child, kql.ast.String))  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]

        elif isinstance(node, kql.ast.FieldComparison) and node.field == kql.ast.Field("event.dataset"):  # type: ignore[reportUnknownMemberType]
            datasets.update(child.value for child in node.value if isinstance(child, kql.ast.String))  # type: ignore[reportUnknownMemberType, reportUnknownVariableType]

    return datasets, modules
2020-09-16 08:36:48 -06:00
def get_schema_from_kql(tree: kql.ast.BaseNode, beats: list[str], version: str | None = None) -> dict[str, Any]:
    """Get a schema based on datasets and modules in a KQL AST."""
    datasets, modules = get_datasets_and_modules(tree)
    return get_schema_from_datasets(beats, modules, datasets, version=version)
def parse_beats_from_index(indexes: list[str] | None) -> list[str]:
2024-08-06 18:07:12 -04:00
"""Parse beats schema types from index."""
indexes = indexes or []
beat_types: list[str] = []
2024-11-13 11:17:08 +05:30
# Need to split on : or :: to support cross-cluster search
2024-08-06 18:07:12 -04:00
# e.g. mycluster:logs-* -> logs-*
for index in indexes:
if "beat-*" in index:
index_parts = index.replace("::", ":").split(":", 1)
2024-08-06 18:07:12 -04:00
last_part = index_parts[-1]
beat_type = last_part.split("-")[0]
beat_types.append(beat_type)
return beat_types