Files
sigma-rules/detection_rules/beats.py
T
2020-06-29 23:17:42 -06:00

161 lines
5.5 KiB
Python

# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
"""Beats schemas management."""
import os
import kql
import requests
import yaml
from .semver import Version
from .utils import unzip, load_etc_dump, save_etc_dump, get_etc_path
def download_latest_beats_schema():
    """Download the newest beats release and save its field schemas.

    The release list is fetched from the GitHub API and the release whose tag
    compares highest as a ``Version`` is downloaded as a zipball. Every
    ``fields.yml``, ``fields.common.yml`` and ``config.yml`` in the archive is
    parsed and placed in a nested ``{"folders": {...}, "files": {...}}`` tree
    that mirrors the archive layout (with the root folder and a leading
    ``x-pack`` component removed). Only top-level folders whose name ends with
    ``beat`` are kept, and the result is written with ``save_etc_dump`` to
    ``beats_schemas/<tag>.yml``.

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
    """
    url = 'https://api.github.com/repos/elastic/beats/releases'
    releases = requests.get(url)
    # fail loudly (e.g. on API rate limiting) instead of tripping over the error payload below
    releases.raise_for_status()

    latest_release = max(releases.json(), key=lambda release: Version(release["tag_name"].lstrip("v")))

    print(f"Downloading beats {latest_release['tag_name']}")

    response = requests.get(latest_release['zipball_url'])
    response.raise_for_status()

    print(f"Downloaded {len(response.content) / 1024.0 / 1024.0:.2f} MB release.")

    schema_files = ("fields.yml", "fields.common.yml", "config.yml")
    fs = {}

    with unzip(response.content) as archive:
        # assumes the first archive entry is the root folder of the zipball
        base_directory = archive.namelist()[0]

        for name in archive.namelist():
            if os.path.basename(name) not in schema_files:
                continue

            # chop off the base directory name
            key = name[len(base_directory):]

            if key.startswith("x-pack"):
                key = key[len("x-pack") + 1:]

            try:
                decoded = yaml.safe_load(archive.read(name))
            except yaml.YAMLError:
                print(f"Error loading {name}")
                # nothing was decoded for this file, so there is nothing to store
                continue

            # create a hierarchical structure; zip member names always use "/"
            # regardless of the host OS, so don't split on os.path.sep
            directory, _, base_name = key.rpartition("/")
            branch = fs
            for limb in directory.split("/"):
                branch = branch.setdefault("folders", {}).setdefault(limb, {})

            branch.setdefault("files", {})[base_name] = decoded

    # remove all non-beat directories
    fs = {k: v for k, v in fs.get("folders", {}).items() if k.endswith("beat")}

    print(f"Saving etc/beats_schemas/{latest_release['tag_name']}.yml")
    save_etc_dump(fs, "beats_schemas", latest_release["tag_name"] + ".yml")
def _flatten_schema(schema: list, prefix="") -> list:
if schema is None:
# sometimes we see `fields: null` in the yaml
return []
flattened = []
for s in schema:
if s.get("type") == "group":
flattened.extend(_flatten_schema(s["fields"], prefix=prefix + s["name"] + "."))
elif "fields" in s:
flattened.extend(_flatten_schema(s["fields"], prefix=prefix))
elif "type" in s:
s = s.copy()
s["name"] = prefix + s["name"]
flattened.append(s)
return flattened
def get_field_schema(base_directory, prefix="", include_common=False):
    """Collect the flattened fields stored in the ``_meta`` folder of a directory node.

    :param base_directory: directory node of the schema tree (a dict with optional
        ``folders`` / ``files`` keys)
    :param prefix: prefix handed to ``_flatten_schema`` for the field names
    :param include_common: also read ``fields.common.yml`` in addition to ``fields.yml``
    :return: list of flattened field dicts; empty when no matching file exists
    """
    meta_files = base_directory.get("folders", {}).get("_meta", {}).get("files", {})

    wanted = ["fields.yml"]
    if include_common:
        wanted.append("fields.common.yml")

    collected = []
    for file_name in wanted:
        if file_name not in meta_files:
            continue
        collected += _flatten_schema(meta_files[file_name], prefix=prefix)

    return collected
def get_beats_schema(schema: dict, beat: str, module: str, *datasets: str):
    """Build a ``{field name: field definition}`` mapping for one beat module.

    Fields are gathered from the beat itself, then from the module, then from
    each dataset of the module (dataset fields are prefixed with ``<module>.``).
    When no datasets are given, every dataset folder of the module whose name
    does not start with ``_`` is used. If the same field name occurs more than
    once, the definition gathered last wins.

    :param schema: full schema tree keyed by beat name
    :param beat: name of the beat to look up in ``schema``
    :param module: module name inside the beat
    :param datasets: dataset names, optionally written as ``<module>.<dataset>``
    :return: dict of field definitions, inserted in order of field name
    :raises KeyError: if ``beat`` is not a key of ``schema``
    """
    if beat not in schema:
        raise KeyError(f"Unknown beats module {beat}")

    beat_dir = schema[beat]
    fields = list(get_field_schema(beat_dir, include_common=True))

    module_dir = beat_dir.get("folders", {}).get("module", {}).get("folders", {}).get(module, {})
    fields.extend(get_field_schema(module_dir, include_common=True))

    dataset_dirs = module_dir.get("folders", {})

    # if we only have a module then we'll work with what we got
    if datasets:
        selected = datasets
    else:
        selected = [folder for folder in dataset_dirs if not folder.startswith("_")]

    module_prefix = module + "."
    for dataset in selected:
        # replace aws.s3 -> s3
        short_name = dataset[len(module_prefix):] if dataset.startswith(module_prefix) else dataset
        dataset_dir = dataset_dirs.get(short_name, {})
        fields.extend(get_field_schema(dataset_dir, prefix=module_prefix, include_common=True))

    # stable sort keeps gathering order among equal names, so later duplicates win
    fields.sort(key=lambda entry: entry["name"])

    by_name = {}
    for entry in fields:
        by_name[entry["name"]] = entry
    return by_name
# module-level cache filled by read_beats_schema() on first use
SCHEMA = None


def read_beats_schema():
    """Return the saved beats schema, loading it from ``etc/beats_schemas`` on first use.

    On the first call the entries of the ``beats_schemas`` etc directory are
    listed, the one that compares highest as a ``Version`` (after stripping
    leading ``v`` characters) is loaded with ``load_etc_dump``, and the result
    is cached in the module-level ``SCHEMA``. Later calls return the cached value.
    """
    global SCHEMA

    if SCHEMA is not None:
        return SCHEMA

    available = os.listdir(get_etc_path("beats_schemas"))
    newest = max(available, key=lambda file_name: Version(file_name.lstrip("v")))
    SCHEMA = load_etc_dump("beats_schemas", newest)
    return SCHEMA
def get_schema_for_query(tree: kql.ast, beats: list) -> dict:
    """Build the beats field schema relevant to a parsed KQL query.

    The query's AST is scanned for comparisons on ``event.module`` and
    ``event.dataset``; the string values found there select which modules and
    datasets are looked up, for every beat in ``beats``, in the schema returned
    by ``read_beats_schema``.

    :param tree: parsed KQL query, iterated node by node
    :param beats: names of the beats to search
    :return: merged ``{field name: field definition}`` dict; empty when the
        query names no module
    """
    modules = set()
    datasets = set()
    targets = (("event.module", modules), ("event.dataset", datasets))

    # extract out event.module and event.dataset from the query's AST
    for node in tree:
        if not isinstance(node, kql.ast.FieldComparison):
            continue
        for field_name, bucket in targets:
            if node.field == kql.ast.Field(field_name):
                bucket.update(child.value for child in node.value if isinstance(child, kql.ast.String))

    beats_schema = read_beats_schema()

    filtered = {}
    for beat in beats:
        # NOTE: only modules named in the query are used; there is no fallback
        # to "all modules of the beat" when the query names none
        for module in modules:
            filtered.update(get_beats_schema(beats_schema, beat, module, *datasets))

    return filtered