# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.

"""ECS Schemas management."""
import os
import posixpath

import kql
import requests
import yaml

from .semver import Version
from .utils import unzip, load_etc_dump, save_etc_dump, get_etc_path


def download_latest_beats_schema():
    """Download the field definitions of the latest beats release and save them as an etc dump.

    The release list is fetched from the GitHub API and the entry with the highest
    ``tag_name`` version is selected. Every ``fields.yml``, ``fields.common.yml`` and
    ``config.yml`` found in the release zipball is parsed and stored in a nested
    ``{"folders": {...}, "files": {...}}`` structure mirroring the archive layout.
    Only top-level folders whose name ends with ``beat`` are kept.

    Raises:
        requests.HTTPError: if either the release listing or the zipball request
            returns an error status.
    """
    url = 'https://api.github.com/repos/elastic/beats/releases'
    releases = requests.get(url)
    # Surface HTTP failures (e.g. API rate limiting) here rather than as an obscure
    # error while trying to interpret the error payload as a release list.
    releases.raise_for_status()

    latest_release = max(releases.json(), key=lambda release: Version(release["tag_name"].lstrip("v")))

    print(f"Downloading beats {latest_release['tag_name']}")

    response = requests.get(latest_release['zipball_url'])
    response.raise_for_status()

    print(f"Downloaded {len(response.content) / 1024.0 / 1024.0:.2f} MB release.")

    fs = {}

    with unzip(response.content) as archive:
        # the first archive entry is used as the common base directory prefix
        base_directory = archive.namelist()[0]

        for name in archive.namelist():
            if os.path.basename(name) not in ("fields.yml", "fields.common.yml", "config.yml"):
                continue

            contents = archive.read(name)

            # chop off the base directory name
            key = name[len(base_directory):]

            # fold x-pack/<beat>/... into the same tree as <beat>/...
            if key.startswith("x-pack"):
                key = key[len("x-pack") + 1:]

            try:
                decoded = yaml.safe_load(contents)
            except yaml.YAMLError:
                print(f"Error loading {name}")
                # skip this file: `decoded` would otherwise be unbound, or still hold
                # the contents of the previously parsed file
                continue

            # create a hierarchical structure
            # archive member names always use "/" as the separator, regardless of the host OS
            branch = fs
            directory, base_name = posixpath.split(key)
            for limb in directory.split("/"):
                branch = branch.setdefault("folders", {}).setdefault(limb, {})

            branch.setdefault("files", {})[base_name] = decoded

    # remove all non-beat directories
    fs = {k: v for k, v in fs.get("folders", {}).items() if k.endswith("beat")}

    print(f"Saving etc/beats_schemas/{latest_release['tag_name']}.yml")
    save_etc_dump(fs, "beats_schemas", latest_release["tag_name"] + ".yml")


def _flatten_schema(schema: list, prefix="") -> list:
    """Flatten a nested list of field definitions into a flat list of leaf fields.

    Args:
        schema: list of field definition dicts, or ``None``.
        prefix: dotted prefix prepended to the ``name`` of every leaf field.

    Returns:
        A list of copies of the leaf definitions (entries with a ``type`` that is
        not ``group`` and no ``fields`` key) with their ``name`` fully prefixed.
        Entries with neither ``type`` nor ``fields`` are dropped.
    """
    if schema is None:
        # sometimes we see `fields: null` in the yaml
        return []

    flattened = []
    for s in schema:
        if s.get("type") == "group":
            # a group contributes its own name to the prefix of its children;
            # .get() tolerates a group declared without any `fields` key
            flattened.extend(_flatten_schema(s.get("fields"), prefix=prefix + s["name"] + "."))
        elif "fields" in s:
            # a non-group container: descend without extending the prefix
            flattened.extend(_flatten_schema(s["fields"], prefix=prefix))
        elif "type" in s:
            # copy so the caller's schema is not mutated by the rename
            s = s.copy()
            s["name"] = prefix + s["name"]
            flattened.append(s)

    return flattened


def get_field_schema(base_directory, prefix="", include_common=False):
    """Return the flattened fields declared in the ``_meta`` folder of a directory node.

    Args:
        base_directory: a ``{"folders": ..., "files": ...}`` node of the schema tree.
        prefix: dotted prefix passed through to ``_flatten_schema``.
        include_common: when true, ``fields.common.yml`` is read in addition to ``fields.yml``.

    Returns:
        A flat list of field definitions; empty when the node has no matching files.
    """
    base_directory = base_directory.get("folders", {}).get("_meta", {}).get("files", {})
    flattened = []

    file_names = ("fields.yml", "fields.common.yml") if include_common else ("fields.yml", )

    for name in file_names:
        if name in base_directory:
            flattened.extend(_flatten_schema(base_directory[name], prefix=prefix))

    return flattened


def get_beats_schema(schema: dict, beat: str, module: str, *datasets: str):
    """Collect the fields of a beat, one of its modules and the given datasets.

    Args:
        schema: the schema tree, keyed by beat name.
        beat: name of the beat to look up in ``schema``.
        module: name of the module under the beat's ``module`` folder.
        *datasets: dataset names, optionally written as ``<module>.<dataset>``.
            When omitted, every module sub-folder not starting with ``_`` is used.

    Returns:
        A dict mapping field name to field definition, ordered by field name.
        When a name occurs more than once, the definition collected last wins.

    Raises:
        KeyError: if ``beat`` is not a key of ``schema``.
    """
    if beat not in schema:
        raise KeyError(f"Unknown beats module {beat}")

    flattened = []
    beat_dir = schema[beat]
    flattened.extend(get_field_schema(beat_dir, include_common=True))

    module_dir = beat_dir.get("folders", {}).get("module", {}).get("folders", {}).get(module, {})
    flattened.extend(get_field_schema(module_dir, include_common=True))

    # if we only have a module then we'll work with what we got
    if not datasets:
        datasets = [d for d in module_dir.get("folders", {}) if not d.startswith("_")]

    for dataset in datasets:
        # replace aws.s3 -> s3
        if dataset.startswith(module + "."):
            dataset = dataset[len(module) + 1:]

        dataset_dir = module_dir.get("folders", {}).get(dataset, {})
        # dataset-level fields are namespaced under the module name
        flattened.extend(get_field_schema(dataset_dir, prefix=module + ".", include_common=True))

    return {field["name"]: field for field in sorted(flattened, key=lambda f: f["name"])}


# Module-level cache for the schema returned by read_beats_schema(); None until first use.
SCHEMA = None


def read_beats_schema():
    """Load, once, the saved beats schema dump with the highest version and return it.

    The result is cached in the module-level ``SCHEMA``; later calls return the
    cached object without touching the file system.
    """
    global SCHEMA

    if SCHEMA is None:
        beats_schemas = os.listdir(get_etc_path("beats_schemas"))
        # file names are compared as versions after dropping any leading "v"
        latest = max(beats_schemas, key=lambda b: Version(b.lstrip("v")))

        SCHEMA = load_etc_dump("beats_schemas", latest)

    return SCHEMA


def get_schema_for_query(tree: kql.ast, beats: list) -> dict:
    """Build the field schema relevant to a query from the modules and datasets it names.

    Args:
        tree: the query AST; iterating it is expected to yield its nodes.
        beats: names of the beats whose schemas should be consulted.

    Returns:
        A dict mapping field name to field definition, merged across every
        combination of the given beats and the modules referenced in the query.
    """
    filtered = {}
    modules = set()
    datasets = set()

    # extract out event.module and event.dataset from the query's AST
    for node in tree:
        if not isinstance(node, kql.ast.FieldComparison):
            continue

        if node.field == kql.ast.Field("event.module"):
            modules.update(child.value for child in node.value if isinstance(child, kql.ast.String))

        if node.field == kql.ast.Field("event.dataset"):
            datasets.update(child.value for child in node.value if isinstance(child, kql.ast.String))

    beats_schema = read_beats_schema()

    for beat in beats:
        # NOTE: only modules named in the query are used. A query without any
        # event.module comparison therefore yields an empty schema; falling back to
        # all modules of the beat is intentionally not done here.
        for module in modules:
            filtered.update(get_beats_schema(beats_schema, beat, module, *datasets))

    return filtered