[FR] Add ML Jobs to Schemas and Unit Test for Validation (#3161)

* adding machine learning job id validation
* Update rules/ml/credential_access_ml_auth_spike_in_logon_events_from_a_source_ip.toml
* Update tests/test_all_rules.py
* adding integration manifests and schemas from main
* rebuilt manifests and schemas with lmd
* fixed unit test linting
* adding manifests and schemas for other analytic packages
* updated manifests and schemas; adjusted unit test for verbosity
* sorted imports

(cherry picked from commit 3e212e2b74)
2023-10-12 10:51:12 -04:00
parent 788f2ce884
commit 0308e32ea0
4 changed files with 61 additions and 8 deletions
@@ -23,6 +23,7 @@ from . import ecs
 from .beats import flatten_ecs_schema
 from .misc import load_current_package_version
 from .utils import cached, get_etc_path, read_gzip, unzip
+from .schemas import definitions

 MANIFEST_FILE_PATH = Path(get_etc_path('integration-manifests.json.gz'))
 SCHEMA_FILE_PATH = Path(get_etc_path('integration-schemas.json.gz'))
@@ -137,12 +138,12 @@ def build_integrations_schemas(overwrite: bool, integration: str = None) -> None
            # Open the zip file
            with unzip(response.content) as zip_ref:
                for file in zip_ref.namelist():
+                    file_data_bytes = zip_ref.read(file)
                    # Check if the file is a match
                    if glob.fnmatch.fnmatch(file, '*/fields/*.yml'):
                        integration_name = Path(file).parent.parent.name
                        final_integration_schemas[package][version].setdefault(integration_name, {})
-                        file_data = zip_ref.read(file)
-                        schema_fields = yaml.safe_load(file_data)
+                        schema_fields = yaml.safe_load(file_data_bytes)

                        # Parse the schema and add to the integration_manifests
                        data = flatten_ecs_schema(schema_fields)
@@ -150,7 +151,14 @@ def build_integrations_schemas(overwrite: bool, integration: str = None) -> None

                        final_integration_schemas[package][version][integration_name].update(flat_data)

-                        del file_data
+                    # add machine learning jobs to the schema
+                    if integration in list(map(str.lower, definitions.MACHINE_LEARNING_PACKAGES)):
+                        if glob.fnmatch.fnmatch(file, '*/ml_module/*ml.json'):
+                            ml_module = json.loads(file_data_bytes)
+                            job_ids = [job['id'] for job in ml_module['attributes']['jobs']]
+                            final_integration_schemas[package][version]['jobs'] = job_ids
+
+                    del file_data_bytes

    # Write the final integration schemas to disk
    with gzip.open(SCHEMA_FILE_PATH, "w") as schema_file: