[FR] Add Support for Multi-Fields and Validation in Rules (#2882)

This commit is contained in:
Terrance DeJesus
2023-06-28 20:35:33 -04:00
committed by GitHub
parent a7e605a0e5
commit 73970eb2f2
5 changed files with 117 additions and 16 deletions
+51 -8
View File
@@ -23,16 +23,19 @@ from typing import Dict, List, Optional, Tuple
import click
import pytoml
import requests.exceptions
from semver import Version
import yaml
from elasticsearch import Elasticsearch
from eql.table import Table
from semver import Version
from kibana.connector import Kibana
from . import attack, rule_loader, utils
from .beats import (download_beats_schema, download_latest_beats_schema,
refresh_main_schema)
from .cli_utils import single_collection
from .docs import IntegrationSecurityDocs, IntegrationSecurityDocsMDX
from .ecs import download_endpoint_schemas, download_schemas
from .endgame import EndgameSchemaManager
from .eswrap import CollectEvents, add_range_to_dsl
from .ghwrap import GithubClient, update_gist
@@ -47,7 +50,7 @@ from .misc import PYTHON_LICENSE, add_client, client_error
from .packaging import (CURRENT_RELEASE_PATH, PACKAGE_FILE, RELEASE_DIR,
Package, current_stack_version)
from .rule import (AnyRuleData, BaseRuleData, DeprecatedRule, QueryRuleData,
ThreatMapping, TOMLRule, TOMLRuleContents, RuleTransform)
RuleTransform, ThreatMapping, TOMLRule, TOMLRuleContents)
from .rule_loader import RuleCollection, production_filter
from .schemas import definitions, get_stack_versions
from .utils import (dict_hash, get_etc_path, get_path, load_dump,
@@ -1279,17 +1282,57 @@ def update_rule_data_schemas():
cls.save_schema()
@schemas_group.command("generate")
@click.option("--token", required=True, prompt=get_github_token() is None, default=get_github_token(),
              help="GitHub token to use for the PR", hide_input=True)
@click.option("--schema", "-s", required=True, type=click.Choice(["endgame", "ecs", "beats", "endpoint"]),
              help="Schema to generate")
@click.option("--schema-version", "-sv", help="Tagged version from TBD. e.g., 1.9.0")
@click.option("--endpoint-target", "-t", type=str, default="endpoint", help="Target endpoint schema")
@click.option("--overwrite", is_flag=True, help="Overwrite if versions exist")
def generate_schema(token: str, schema: str, schema_version: str, endpoint_target: str, overwrite: bool):
    """Download schemas and generate flattened schema."""
    github = GithubClient(token)
    client = github.authenticated_client

    # semver's Version.parse() raises ValueError for an invalid version string
    # rather than returning a falsy value, so "not Version.parse(...)" can never
    # trigger; translate the failure into a proper CLI error instead of a traceback.
    if schema_version:
        try:
            Version.parse(schema_version)
        except ValueError:
            raise click.BadParameter(f"Invalid schema version: {schema_version}")

    click.echo(f"Generating {schema} schema")
    if schema == "endgame":
        # endgame schemas are pulled from a tagged release, so a version is mandatory
        if not schema_version:
            raise click.BadParameter("Schema version required")
        schema_manager = EndgameSchemaManager(client, schema_version)
        schema_manager.save_schemas(overwrite=overwrite)

    # ecs, beats and endpoint schemas are refreshed during release
    # these schemas do not require a schema version
    if schema == "ecs":
        download_schemas(refresh_all=True)
    if schema == "beats":
        if not schema_version:
            download_latest_beats_schema()
            refresh_main_schema()
        else:
            download_beats_schema(schema_version)

    # endpoint package custom schemas can be downloaded
    # this download requires a specific schema target
    if schema == "endpoint":
        # validate the target before spending a GitHub API call on repo contents
        if not endpoint_target:
            raise click.BadParameter("Endpoint target required")
        repo = client.get_repo("elastic/endpoint-package")
        contents = repo.get_contents("custom_schemas")
        optional_endpoint_targets = [
            Path(f.path).name.replace("custom_", "").replace(".yml", "")
            for f in contents if f.name.endswith(".yml") or Path(f.path).name == endpoint_target
        ]
        if endpoint_target not in optional_endpoint_targets:
            raise click.BadParameter(f"""Invalid endpoint schema target: {endpoint_target}
            \n Schema Options: {optional_endpoint_targets}""")
        download_endpoint_schemas(endpoint_target)

    click.echo(f"Done generating {schema} schema")
@dev_group.group('attack')
+59 -6
View File
@@ -20,8 +20,10 @@ import yaml
from .utils import (DateTimeEncoder, cached, get_etc_path, gzip_compress,
load_etc_dump, read_gzip, unzip)
ETC_NAME = "ecs_schemas"
ECS_SCHEMAS_DIR = get_etc_path(ETC_NAME)
ECS_NAME = "ecs_schemas"
ECS_SCHEMAS_DIR = get_etc_path(ECS_NAME)
ENDPOINT_NAME = "endpoint_schemas"
ENDPOINT_SCHEMAS_DIR = get_etc_path(ENDPOINT_NAME)
def add_field(schema, name, info):
@@ -117,11 +119,16 @@ def get_eql_schema(version=None, index_patterns=None):
field_type = schema_info.get('type', '')
add_field(converted, field, convert_type(field_type))
# add non-ecs schema
if index_patterns:
for index_name in index_patterns:
for k, v in flatten(get_index_schema(index_name)).items():
add_field(converted, k, convert_type(v))
# add endpoint custom schema
for k, v in flatten(get_endpoint_schemas()).items():
add_field(converted, k, convert_type(v))
return converted
@@ -198,6 +205,9 @@ def get_kql_schema(version=None, indexes=None, beat_schema=None) -> dict:
for index_name in indexes:
converted.update(**flatten(get_index_schema(index_name)))
# add endpoint custom schema
converted.update(**flatten(get_endpoint_schemas()))
if isinstance(beat_schema, dict):
converted = dict(flatten_multi_fields(beat_schema), **converted)
@@ -214,7 +224,7 @@ def download_schemas(refresh_master=True, refresh_all=False, verbose=True):
version = Version.parse(release.get('tag_name', '').lstrip('v'))
# we don't ever want beta
if not version or version < (1, 0, 1) or version in existing:
if not version or version < Version.parse("1.0.1") or version in existing:
continue
schema_dir = os.path.join(ECS_SCHEMAS_DIR, str(version))
@@ -236,7 +246,7 @@ def download_schemas(refresh_master=True, refresh_all=False, verbose=True):
out_file = file_name.replace(".yml", ".json.gz")
compressed = gzip_compress(json.dumps(contents, sort_keys=True, cls=DateTimeEncoder))
new_path = get_etc_path(ETC_NAME, str(version), out_file)
new_path = get_etc_path(ECS_NAME, str(version), out_file)
with open(new_path, 'wb') as f:
f.write(compressed)
@@ -259,12 +269,55 @@ def download_schemas(refresh_master=True, refresh_all=False, verbose=True):
shutil.rmtree(m, ignore_errors=True)
master_dir = "master_{}".format(master_ver)
os.makedirs(get_etc_path(ETC_NAME, master_dir), exist_ok=True)
os.makedirs(get_etc_path(ECS_NAME, master_dir), exist_ok=True)
compressed = gzip_compress(json.dumps(master_schema, sort_keys=True, cls=DateTimeEncoder))
new_path = get_etc_path(ETC_NAME, master_dir, "ecs_flat.json.gz")
new_path = get_etc_path(ECS_NAME, master_dir, "ecs_flat.json.gz")
with open(new_path, 'wb') as f:
f.write(compressed)
if verbose:
print('Saved files to {}: \n\t- {}'.format(master_dir, 'ecs_flat.json.gz'))
def download_endpoint_schemas(target: str, overwrite: bool = True) -> None:
    """Download an endpoint custom schema, flatten it, and cache it on disk.

    Args:
        target: name of the custom schema (e.g. ``behavior``) under
            ``custom_schemas`` in the elastic/endpoint-package repo.
        overwrite: when False, leave an already-downloaded schema untouched.
    """
    # location of custom schema YAML files
    url = "https://raw.githubusercontent.com/elastic/endpoint-package/main/custom_schemas"
    r = requests.get(f"{url}/custom_{target}.yml")
    if r.status_code == 404:
        # fall back to the per-target directory layout with a .yaml suffix
        r = requests.get(f"{url}/{target}/custom_{target}.yaml")
    r.raise_for_status()

    schema = yaml.safe_load(r.text)[0]
    root_name = schema["name"]
    fields = schema["fields"]

    # iterate over nested fields and flatten them; each multi_fields entry
    # expands into its own dotted field name (e.g. name.caseless)
    flattened = {}
    for field in fields:
        if 'multi_fields' in field:
            for mf in field['multi_fields']:
                flattened[f"{root_name}.{field['name']}.{mf['name']}"] = mf['type']
        else:
            flattened[f"{root_name}.{field['name']}"] = field['type']

    # save schema to disk
    Path(ENDPOINT_SCHEMAS_DIR).mkdir(parents=True, exist_ok=True)
    new_path = Path(ENDPOINT_SCHEMAS_DIR) / f"endpoint_{target}.json.gz"
    # the original called shutil.rmtree() on a *file* path (a silent no-op with
    # ignore_errors=True) and then clobbered the file regardless of the flag;
    # honor overwrite=False by leaving an existing schema in place
    if new_path.exists() and not overwrite:
        print(f"Endpoint schema already exists at {new_path}; skipping")
        return
    compressed = gzip_compress(json.dumps(flattened, sort_keys=True, cls=DateTimeEncoder))
    with open(new_path, 'wb') as f:
        f.write(compressed)
    print(f"Saved endpoint schema to {new_path}")
@cached
def get_endpoint_schemas() -> dict:
    """Load endpoint schemas."""
    merged: dict = {}
    # every cached endpoint schema lives as a gzipped JSON file in one directory
    pattern = os.path.join(ENDPOINT_SCHEMAS_DIR, '*.json.gz')
    for schema_file in glob.glob(pattern):
        merged.update(json.loads(read_gzip(schema_file)))
    return merged
+1 -2
View File
@@ -76,8 +76,7 @@
"process.Ext.relative_file_name_modify_time": "double",
"process.Ext.relative_file_creation_time": "double",
"Target.process.name": "keyword",
"process.Ext.api.name": "keyword",
"process.name.caseless": "keyword"
"process.Ext.api.name": "keyword"
},
"logs-windows.*": {
"powershell.file.script_block_text": "text"
+6
View File
@@ -113,6 +113,9 @@ class KQLValidator(QueryValidator):
# add non-ecs-schema fields for edge cases not added to the integration
for index_name in data.index:
integration_schema.update(**ecs.flatten(ecs.get_index_schema(index_name)))
# add endpoint schema fields for multi-line fields
integration_schema.update(**ecs.flatten(ecs.get_endpoint_schemas()))
combined_schema.update(**integration_schema)
try:
@@ -243,6 +246,9 @@ class EQLValidator(QueryValidator):
# add non-ecs-schema fields for edge cases not added to the integration
for index_name in data.index:
integration_schema.update(**ecs.flatten(ecs.get_index_schema(index_name)))
# add endpoint schema fields for multi-line fields
integration_schema.update(**ecs.flatten(ecs.get_endpoint_schemas()))
combined_schema.update(**integration_schema)
eql_schema = ecs.KqlSchema2Eql(integration_schema)