diff --git a/detection_rules/etc/integration-manifests.json.gz b/detection_rules/etc/integration-manifests.json.gz index d7f09bdfc..9529c165b 100644 Binary files a/detection_rules/etc/integration-manifests.json.gz and b/detection_rules/etc/integration-manifests.json.gz differ diff --git a/detection_rules/etc/integration-schemas.json.gz b/detection_rules/etc/integration-schemas.json.gz index ffc241b03..37c60920a 100644 Binary files a/detection_rules/etc/integration-schemas.json.gz and b/detection_rules/etc/integration-schemas.json.gz differ diff --git a/detection_rules/integrations.py b/detection_rules/integrations.py index faa305175..2c264d051 100644 --- a/detection_rules/integrations.py +++ b/detection_rules/integrations.py @@ -23,6 +23,7 @@ from . import ecs from .beats import flatten_ecs_schema from .misc import load_current_package_version from .utils import cached, get_etc_path, read_gzip, unzip +from .schemas import definitions MANIFEST_FILE_PATH = Path(get_etc_path('integration-manifests.json.gz')) SCHEMA_FILE_PATH = Path(get_etc_path('integration-schemas.json.gz')) @@ -137,12 +138,12 @@ def build_integrations_schemas(overwrite: bool, integration: str = None) -> None # Open the zip file with unzip(response.content) as zip_ref: for file in zip_ref.namelist(): + file_data_bytes = zip_ref.read(file) # Check if the file is a match if glob.fnmatch.fnmatch(file, '*/fields/*.yml'): integration_name = Path(file).parent.parent.name final_integration_schemas[package][version].setdefault(integration_name, {}) - file_data = zip_ref.read(file) - schema_fields = yaml.safe_load(file_data) + schema_fields = yaml.safe_load(file_data_bytes) # Parse the schema and add to the integration_manifests data = flatten_ecs_schema(schema_fields) @@ -150,7 +151,14 @@ def build_integrations_schemas(overwrite: bool, integration: str = None) -> None final_integration_schemas[package][version][integration_name].update(flat_data) - del file_data + # add machine learning jobs to the schema + if integration in list(map(str.lower, definitions.MACHINE_LEARNING_PACKAGES)): + if glob.fnmatch.fnmatch(file, '*/ml_module/*ml.json'): + ml_module = json.loads(file_data_bytes) + job_ids = [job['id'] for job in ml_module['attributes']['jobs']] + final_integration_schemas[package][version]['jobs'] = job_ids + + del file_data_bytes # Write the final integration schemas to disk with gzip.open(SCHEMA_FILE_PATH, "w") as schema_file: diff --git a/tests/test_all_rules.py b/tests/test_all_rules.py index 67aeb151a..ec0a2c79e 100644 --- a/tests/test_all_rules.py +++ b/tests/test_all_rules.py @@ -10,24 +10,27 @@ import unittest import uuid import warnings from collections import defaultdict -from marshmallow import ValidationError from pathlib import Path import eql.ast +from marshmallow import ValidationError from semver import Version import kql from detection_rules import attack from detection_rules.beats import parse_beats_from_index -from detection_rules.integrations import load_integrations_schemas +from detection_rules.integrations import (find_latest_compatible_version, + load_integrations_manifests, + load_integrations_schemas) from detection_rules.misc import load_current_package_version from detection_rules.packaging import current_stack_version -from detection_rules.rule import (QueryRuleData, TOMLRuleContents, - load_integrations_manifests, QueryValidator) +from detection_rules.rule import (QueryRuleData, QueryValidator, + TOMLRuleContents) from detection_rules.rule_loader import FILE_PATTERN from detection_rules.rule_validators import EQLValidator, KQLValidator from detection_rules.schemas import definitions, get_stack_schemas -from detection_rules.utils import INTEGRATION_RULE_DIR, get_path, load_etc_dump, PatchedTemplate +from detection_rules.utils import (INTEGRATION_RULE_DIR, PatchedTemplate, + get_path, load_etc_dump) from detection_rules.version_lock import default_version_lock from rta import get_available_tests @@ -894,6 +897,48 @@ class TestIntegrationRules(BaseRuleTest): self.fail(f'The following ({len(failures)}) rules have a `min_stack_version` defined but missing comments:' f'\n{err_msg}') + def test_ml_integration_jobs_exist(self): + """Test that machine learning jobs exist in the integration.""" + failures = [] + + ml_integration_names = list(map(str.lower, definitions.MACHINE_LEARNING_PACKAGES)) + integration_schemas = load_integrations_schemas() + integration_manifests = load_integrations_manifests() + + for rule in self.all_rules: + if rule.contents.data.type == "machine_learning": + ml_integration_name = next((i for i in rule.contents.metadata.integration + if i in ml_integration_names), None) + if ml_integration_name: + if "machine_learning_job_id" not in dir(rule.contents.data): + failures.append(f'{self.rule_str(rule)} missing `machine_learning_job_id`') + else: + rule_job_id = rule.contents.data.machine_learning_job_id + ml_schema = integration_schemas.get(ml_integration_name) + min_version = Version.parse( + rule.contents.metadata.min_stack_version or load_current_package_version(), + optional_minor_and_patch=True + ) + latest_compat_ver = find_latest_compatible_version( + package=ml_integration_name, + integration="", + rule_stack_version=min_version, + packages_manifest=integration_manifests + ) + compat_integration_schema = ml_schema[latest_compat_ver[0]] + if rule_job_id not in compat_integration_schema['jobs']: + failures.append( + f'{self.rule_str(rule)} machine_learning_job_id `{rule_job_id}` not found ' + f'in version `{latest_compat_ver[0]}` of `{ml_integration_name}` integration. ' + f'existing jobs: {compat_integration_schema["jobs"]}' + ) + + if failures: + err_msg = '\n'.join(failures) + self.fail( + f'The following ({len(failures)}) rules are missing a valid `machine_learning_job_id`:\n{err_msg}' + ) + class TestRuleTiming(BaseRuleTest): """Test rule timing and timestamps."""