sigma-rules/detection_rules/index_mappings.py

# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.

"""Validation logic for rules containing queries."""

import re
import time
from collections.abc import Callable
from copy import deepcopy
from typing import Any

from elastic_transport import ObjectApiResponse
from elasticsearch import Elasticsearch  # type: ignore[reportMissingTypeStubs]
from elasticsearch.exceptions import BadRequestError
from semver import Version

from . import ecs, integrations, misc, utils
from .config import load_current_package_version
from .esql import EventDataset
from .esql_errors import (
    EsqlKibanaBaseError,
    EsqlSchemaError,
    EsqlSyntaxError,
    EsqlTypeMismatchError,
    EsqlUnknownIndexError,
    EsqlUnsupportedTypeError,
    cleanup_empty_indices,
)
from .integrations import (
    load_integrations_manifests,
    load_integrations_schemas,
)
from .rule import RuleMeta
from .schemas import get_stack_schemas
from .schemas.definitions import HTTP_STATUS_BAD_REQUEST
from .utils import combine_dicts


def delete_nested_key_from_dict(d: dict[str, Any], compound_key: str) -> None:
    """Delete a nested key from a dictionary."""
    keys = compound_key.split(".")
    for key in keys[:-1]:
        if key in d and isinstance(d[key], dict):
            d = d[key]  # type: ignore[reportUnknownVariableType]
        else:
            return
    d.pop(keys[-1], None)


def flat_schema_to_index_mapping(flat_schema: dict[str, str]) -> dict[str, Any]:
    """
    Convert dicts with flat JSON paths and values into a nested mapping with
    intermediary `properties`, `fields` and `type` fields.
    """

    # Sorting here ensures that 'a.b' processed before 'a.b.c', allowing us to correctly
    # detect and handle multi-fields.
    sorted_items = sorted(flat_schema.items())
    result = {}

    for field_path, field_type in sorted_items:
        parts = field_path.split(".")
        current_level = result

        for part in parts[:-1]:
            node = current_level.setdefault(part, {})  # type: ignore[reportUnknownVariableType]

            if "type" in node and node["type"] not in ("nested", "object"):
                current_level = node.setdefault("fields", {})  # type: ignore[reportUnknownVariableType]
            else:
                current_level = node.setdefault("properties", {})  # type: ignore[reportUnknownVariableType]

        leaf_key = parts[-1]
        current_level[leaf_key] = {"type": field_type}

        # add `scaling_factor` field missing in the schema
        # https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/number#scaled-float-params
        if field_type == "scaled_float":
            current_level[leaf_key]["scaling_factor"] = 1000

        # add `path` field for `alias` fields, set to a dummy value
        if field_type == "alias":
            current_level[leaf_key]["path"] = "@timestamp"

    return result  # type: ignore[reportUnknownVariableType]


def get_rule_integrations(metadata: RuleMeta) -> list[str]:
    """Retrieve rule integrations from metadata."""
    if metadata.integration:
        rule_integrations: list[str] = (
            metadata.integration if isinstance(metadata.integration, list) else [metadata.integration]
        )
    else:
        rule_integrations: list[str] = []
    return rule_integrations


def create_index_with_index_mapping(
    elastic_client: Elasticsearch, index_name: str, mappings: dict[str, Any]
) -> ObjectApiResponse[Any] | None:
    """Create an index with the specified mappings and settings to support large number of fields and nested objects."""
    try:
        return elastic_client.indices.create(
            index=index_name,
            mappings={"properties": mappings},
            settings={
                "index.mapping.total_fields.limit": 10000,
                "index.mapping.nested_fields.limit": 500,
                "index.mapping.nested_objects.limit": 10000,
            },
        )
    except BadRequestError as e:
        error_message = str(e)
        if (
            e.status_code == HTTP_STATUS_BAD_REQUEST
            and "validation_exception" in error_message
            and "Validation Failed: 1: this action would add [2] shards" in error_message
        ):
            cleanup_empty_indices(elastic_client)
            try:
                return elastic_client.indices.create(
                    index=index_name,
                    mappings={"properties": mappings},
                    settings={
                        "index.mapping.total_fields.limit": 10000,
                        "index.mapping.nested_fields.limit": 500,
                        "index.mapping.nested_objects.limit": 10000,
                    },
                )
            except BadRequestError as retry_error:
                raise EsqlSchemaError(str(retry_error), elastic_client) from retry_error
        raise EsqlSchemaError(error_message, elastic_client) from e


def get_existing_mappings(elastic_client: Elasticsearch, indices: list[str]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Retrieve mappings for all matching existing index templates."""
    existing_mappings: dict[str, Any] = {}
    index_lookup: dict[str, Any] = {}
    for index in indices:
        index_tmpl_mappings = get_simulated_index_template_mappings(elastic_client, index)
        index_lookup[index] = index_tmpl_mappings
        combine_dicts(existing_mappings, index_tmpl_mappings)
    return existing_mappings, index_lookup


def get_simulated_index_template_mappings(elastic_client: Elasticsearch, name: str) -> dict[str, Any]:
    """
    Return the mappings from the index configuration that would be applied
    to the specified index from an existing index template

    https://elasticsearch-py.readthedocs.io/en/stable/api/indices.html#elasticsearch.client.IndicesClient.simulate_index_template
    """
    template = elastic_client.indices.simulate_index_template(name=name)
    if not template:
        return {}
    return template["template"]["mappings"]["properties"]


def prune_mappings_of_unsupported_types(
    debug_str_data_source: str, stream_mappings: dict[str, Any], log: Callable[[str], None]
) -> dict[str, Any]:
    """Prune fields with unsupported types (ES|QL) from the provided mappings."""
    nested_multifields = find_nested_multifields(stream_mappings)
    for field in nested_multifields:
        parts = str(field).split(".fields.")[0].split(".")
        base_name = ".properties.".join(parts)
        field_name = f"{base_name}.fields"
        log(
            f"Warning: Nested multi-field `{field}` found in `{debug_str_data_source}`. "
            f"Removing parent field from schema for ES|QL validation."
        )
        delete_nested_key_from_dict(stream_mappings, field_name)
    nested_flattened_fields = find_flattened_fields_with_subfields(stream_mappings)
    for field in nested_flattened_fields:
        # Remove both .fields and .properties entries for flattened fields
        # .properties entries can occur when being merged with non-ecs or custom schemas
        parts = str(field).split(".fields.")[0].split(".")
        base_name = ".properties.".join(parts)
        field_name = f"{base_name}.fields"
        property_name = f"{base_name}.properties"
        log(
            f"Warning: flattened field `{field}` found in `{debug_str_data_source}` with sub fields. "
            f"Removing parent field from schema for ES|QL validation."
        )
        delete_nested_key_from_dict(stream_mappings, field_name)
        delete_nested_key_from_dict(stream_mappings, property_name)
    return stream_mappings


def prepare_integration_mappings(  # noqa: PLR0913
    rule_integrations: list[str],
    event_dataset_integrations: list[EventDataset],
    package_manifests: Any,
    integration_schemas: Any,
    stack_version: str,
    log: Callable[[str], None],
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Prepare integration mappings for the given rule integrations."""
    integration_mappings: dict[str, Any] = {}
    index_lookup: dict[str, Any] = {}
    dataset_restriction: dict[str, list[str]] = {}

    # Process restrictions, note we need this for loops to be separate
    for event_dataset in event_dataset_integrations:
        # Ensure the integration is in rule_integrations
        if event_dataset.package not in rule_integrations:
            dataset_restriction.setdefault(event_dataset.package, []).append(event_dataset.integration)
    for event_dataset in event_dataset_integrations:
        if event_dataset.package not in rule_integrations:
            rule_integrations.append(event_dataset.package)

    for integration in rule_integrations:
        package = integration
        package_version, _ = integrations.find_latest_compatible_version(
            package,
            "",
            Version.parse(stack_version),
            package_manifests,
        )
        package_schema = integration_schemas[package][package_version]

        # Apply dataset restrictions if any
        if integration in dataset_restriction:
            allowed_keys = dataset_restriction[integration]
            package_schema = {key: value for key, value in package_schema.items() if key in allowed_keys}

        for stream in package_schema:
            flat_schema = package_schema[stream]
            stream_mappings = flat_schema_to_index_mapping(flat_schema)
            stream_mappings = prune_mappings_of_unsupported_types(f"{integration}-{stream}", stream_mappings, log)
            utils.combine_dicts(integration_mappings, deepcopy(stream_mappings))
            index_lookup[f"{integration}-{stream}"] = stream_mappings

    return integration_mappings, index_lookup


def get_index_to_package_lookup(indices: list[str], index_lookup: dict[str, Any]) -> dict[str, Any]:
    """Get a lookup of index patterns to package names for the provided indices."""
    index_lookup_indices: dict[str, Any] = {}
    for key in index_lookup:
        if key not in indices:
            # Add logs-<key>* and logs-<key>-*
            transformed_key_star = f"logs-{key.replace('-', '.')}*"
            transformed_key_dash = f"logs-{key.replace('-', '.')}-*"
            if "logs-endpoint." in transformed_key_star or "logs-endpoint." in transformed_key_dash:
                transformed_key_star = transformed_key_star.replace("logs-endpoint.", "logs-endpoint.events.")
                transformed_key_dash = transformed_key_dash.replace("logs-endpoint.", "logs-endpoint.events.")
            index_lookup_indices[transformed_key_star] = key.replace("-", ".")
            index_lookup_indices[transformed_key_dash] = key.replace("-", ".")

    return index_lookup_indices


def get_filtered_index_schema(  # noqa: PLR0913
    indices: list[str],
    index_lookup: dict[str, Any],
    ecs_schema: dict[str, Any],
    non_ecs_mapping: dict[str, Any],
    custom_mapping: dict[str, Any],
    log: Callable[[str], None],
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Check if the provided indices are known based on the integration format. Returns the combined schema."""

    non_ecs_indices = ecs.get_non_ecs_schema()
    custom_indices = ecs.get_custom_schemas()

    # Assumes valid index format is logs-<integration>.<package>* or logs-<integration>.<package>-*
    filtered_keys = {"logs-" + key.replace("-", ".") + "*" for key in index_lookup if key not in indices}
    filtered_keys.update({"logs-" + key.replace("-", ".") + "-*" for key in index_lookup if key not in indices})
    # Replace "logs-endpoint." with "logs-endpoint.events."
    filtered_keys = {
        key.replace("logs-endpoint.", "logs-endpoint.events.") if "logs-endpoint." in key else key
        for key in filtered_keys
    }
    filtered_keys.update(non_ecs_indices.keys())
    filtered_keys.update(custom_indices.keys())
    filtered_keys.add("logs-endpoint.alerts-*")

    matches: list[str] = []
    for index in indices:
        pattern = re.compile(index.replace(".", r"\.").replace("*", ".*").rstrip("-"))
        matches.extend([key for key in filtered_keys if pattern.fullmatch(key)])

    if not matches:
        raise EsqlUnknownIndexError(
            f"Unknown index pattern(s): {', '.join(indices)}. Known patterns: {', '.join(filtered_keys)}"
        )

    if "logs-endpoint.alerts-*" in matches and "logs-endpoint.events.alerts-*" not in matches:
        matches.append("logs-endpoint.events.alerts-*")

    # Now that we have the matched indices, we need to filter the index lookup to only include those indices
    filtered_index_lookup = {
        "logs-" + key.replace("-", ".") + "*": value for key, value in index_lookup.items() if key not in indices
    }
    filtered_index_lookup.update(
        {"logs-" + key.replace("-", ".") + "-*": value for key, value in index_lookup.items() if key not in indices}
    )
    filtered_index_lookup = {
        key.replace("logs-endpoint.", "logs-endpoint.events."): value for key, value in filtered_index_lookup.items()
    }

    # Reduce the combined mappings to only the matched indices (local schema validation source of truth)
    # Custom and non-ecs mappings are filtered before being sent to this function in prepare mappings
    combined_mappings: dict[str, Any] = {}
    utils.combine_dicts(combined_mappings, deepcopy(ecs_schema))
    for match in matches:
        base = filtered_index_lookup.get(match, {})
        # Update filtered index with non-ecs and custom mappings
        # Need to use a merge here to not overwrite existing fields
        utils.combine_dicts(base, deepcopy(non_ecs_mapping.get(match, {})))
        utils.combine_dicts(base, deepcopy(custom_mapping.get(match, {})))
        filtered_index_lookup[match] = prune_mappings_of_unsupported_types(match, base, log)
        utils.combine_dicts(combined_mappings, deepcopy(base))

    # Reduce the index lookup to only the matched indices (remote/Kibana schema validation source of truth)
    filtered_index_mapping: dict[str, Any] = {}
    index_lookup_indices = get_index_to_package_lookup(indices, index_lookup)
    for match in matches:
        if match in index_lookup_indices:
            index_name = index_lookup_indices[match].replace(".", "-")
            filtered_index_mapping[index_name] = index_lookup[index_name]
        else:
            filtered_index_mapping[match] = filtered_index_lookup.get(match, {})

    return combined_mappings, filtered_index_mapping


def create_remote_indices(
    elastic_client: Elasticsearch,
    existing_mappings: dict[str, Any],
    index_lookup: dict[str, Any],
    log: Callable[[str], None],
) -> str:
    """Create remote indices for validation and return the index string."""

    suffix = str(int(time.time() * 1000))
    test_index = f"rule-test-index-{suffix}"
    response = create_index_with_index_mapping(elastic_client, test_index, existing_mappings)
    log(f"Index `{test_index}` created: {response}")
    full_index_str = test_index

    # create all integration indices
    for index, properties in index_lookup.items():
        ind_index_str = f"test-{index.rstrip('*')}{suffix}"
        response = create_index_with_index_mapping(elastic_client, ind_index_str, properties)
        log(f"Index `{ind_index_str}` created: {response}")
        full_index_str = f"{full_index_str}, {ind_index_str}"

    return full_index_str


def execute_query_against_indices(
    elastic_client: Elasticsearch,
    query: str,
    test_index_str: str,
    log: Callable[[str], None],
    delete_indices: bool = True,
) -> tuple[list[Any], ObjectApiResponse[Any]]:
    """Execute the ESQL query against the test indices on a remote Stack and return the columns."""
    try:
        log(f"Executing a query against `{test_index_str}`")
        response = elastic_client.esql.query(query=query)
        log(f"Got query response: {response}")
        query_columns = response.get("columns", [])
    except BadRequestError as e:
        error_msg = str(e)
        if "parsing_exception" in error_msg:
            raise EsqlSyntaxError(str(e), elastic_client) from None
        if "Unknown column" in error_msg:
            raise EsqlSchemaError(str(e), elastic_client) from None
        if "verification_exception" in error_msg and "unsupported type" in error_msg:
            raise EsqlUnsupportedTypeError(str(e), elastic_client) from None
        if "verification_exception" in error_msg:
            raise EsqlTypeMismatchError(str(e), elastic_client) from None
        raise EsqlKibanaBaseError(str(e), elastic_client) from None
    if delete_indices or not misc.getdefault("skip_empty_index_cleanup")():
        for index_str in test_index_str.split(","):
            response = elastic_client.indices.delete(index=index_str.strip())
            log(f"Test index `{index_str}` deleted: {response}")

    query_column_names = [c["name"] for c in query_columns]
    log(f"Got query columns: {', '.join(query_column_names)}")
    return query_columns, response


def find_nested_multifields(mapping: dict[str, Any], path: str = "") -> list[Any]:
    """Recursively search for nested multi-fields in Elasticsearch mappings."""
    nested_multifields = []

    for field, properties in mapping.items():
        current_path = f"{path}.{field}" if path else field

        if isinstance(properties, dict):
            # Check if the field has a `fields` key
            if "fields" in properties:
                # Check if any subfield in `fields` also has a `fields` key
                for subfield, subproperties in properties["fields"].items():  # type: ignore[reportUnknownVariableType]
                    if isinstance(subproperties, dict) and "fields" in subproperties:
                        nested_multifields.append(f"{current_path}.fields.{subfield}")  # type: ignore[reportUnknownVariableType]

            # Recurse into subfields
            if "properties" in properties:
                nested_multifields.extend(  # type: ignore[reportUnknownVariableType]
                    find_nested_multifields(properties["properties"], current_path)  # type: ignore[reportUnknownVariableType]
                )

    return nested_multifields  # type: ignore[reportUnknownVariableType]


def find_flattened_fields_with_subfields(mapping: dict[str, Any], path: str = "") -> list[str]:
    """Recursively search for type 'flattened' that have a 'fields' or 'properties' key in Elasticsearch mappings."""
    flattened_fields_with_subfields: list[str] = []

    for field, properties in mapping.items():
        current_path = f"{path}.{field}" if path else field

        if isinstance(properties, dict):
            # Check if the field is of type 'flattened' and has a 'fields' key
            if properties.get("type") == "flattened" and "fields" in properties:  # type: ignore[reportUnknownVariableType]
                flattened_fields_with_subfields.append(current_path)  # type: ignore[reportUnknownVariableType]
            # Check if the field is of type 'flattened' and has a 'properties' key
            if properties.get("type") == "flattened" and "properties" in properties:  # type: ignore[reportUnknownVariableType]
                flattened_fields_with_subfields.append(current_path)  # type: ignore[reportUnknownVariableType]

            # Recurse into subfields
            if "properties" in properties:
                flattened_fields_with_subfields.extend(  # type: ignore[reportUnknownVariableType]
                    find_flattened_fields_with_subfields(properties["properties"], current_path)  # type: ignore[reportUnknownVariableType]
                )

    return flattened_fields_with_subfields


def get_ecs_schema_mappings(current_version: Version) -> dict[str, Any]:
    """Get the ECS schema in an index mapping format (nested schema) handling scaled floats."""
    ecs_version = get_stack_schemas()[str(current_version)]["ecs"]
    ecs_schemas = ecs.get_schemas()
    ecs_schema_flattened: dict[str, Any] = {}
    ecs_schema_scaled_floats: dict[str, Any] = {}
    for index, info in ecs_schemas[ecs_version]["ecs_flat"].items():
        if info["type"] == "scaled_float":
            ecs_schema_scaled_floats.update({index: info["scaling_factor"]})
        ecs_schema_flattened.update({index: info["type"]})
    ecs_schema = utils.convert_to_nested_schema(ecs_schema_flattened)
    for index, info in ecs_schema_scaled_floats.items():
        parts = index.split(".")
        current = ecs_schema

        # Traverse the ecs_schema to the correct nested dictionary
        for part in parts[:-1]:  # Traverse all parts except the last one
            current = current.setdefault(part, {}).setdefault("properties", {})

        current[parts[-1]].update({"scaling_factor": info})
    return ecs_schema


def prepare_mappings(  # noqa: PLR0913
    elastic_client: Elasticsearch,
    indices: list[str],
    event_dataset_integrations: list[EventDataset],
    metadata: RuleMeta,
    stack_version: str,
    log: Callable[[str], None],
) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
    """Prepare index mappings for the given indices and rule integrations."""
    existing_mappings, index_lookup = get_existing_mappings(elastic_client, indices)

    # Collect mappings for the integrations
    rule_integrations = get_rule_integrations(metadata)

    # Collect mappings for all relevant integrations for the given stack version
    package_manifests = load_integrations_manifests()
    integration_schemas = load_integrations_schemas()

    integration_mappings, integration_index_lookup = prepare_integration_mappings(
        rule_integrations, event_dataset_integrations, package_manifests, integration_schemas, stack_version, log
    )

    index_lookup.update(integration_index_lookup)

    # Load non-ecs schema and convert to index mapping format (nested schema)
    # For non_ecs we need both a mapping and a schema as custom schemas can override non-ecs fields
    # In these cases we need to accept the overwrite keep the original non-ecs field in the schema
    non_ecs_schema: dict[str, Any] = {}
    non_ecs_mapping: dict[str, Any] = {}
    non_ecs = ecs.get_non_ecs_schema()
    for index in indices:
        index_mapping = non_ecs.get(index, {})
        non_ecs_schema.update(index_mapping)
        index_mapping = ecs.flatten(index_mapping)
        index_mapping = utils.convert_to_nested_schema(index_mapping)
        non_ecs_mapping.update({index: index_mapping})

    # These need to be handled separately as we need to be able to validate non-ecs fields as a whole
    # and also at a per index level as custom schemas can override non-ecs fields and/or indices
    non_ecs_schema = ecs.flatten(non_ecs_schema)
    non_ecs_schema = utils.convert_to_nested_schema(non_ecs_schema)
    non_ecs_schema = prune_mappings_of_unsupported_types("non-ecs", non_ecs_schema, log)

    # Load custom schema and convert to index mapping format (nested schema)
    custom_mapping: dict[str, Any] = {}
    custom_indices = ecs.get_custom_schemas()
    for index in indices:
        index_mapping = custom_indices.get(index, {})
        index_mapping = ecs.flatten(index_mapping)
        index_mapping = utils.convert_to_nested_schema(index_mapping)
        custom_mapping.update({index: index_mapping})

    # Load ECS in an index mapping format (nested schema)
    current_version = Version.parse(load_current_package_version(), optional_minor_and_patch=True)
    ecs_schema = get_ecs_schema_mappings(current_version)

    # Filter combined mappings based on the provided indices
    combined_mappings, index_lookup = get_filtered_index_schema(
        indices, index_lookup, ecs_schema, non_ecs_mapping, custom_mapping, log
    )

    index_lookup.update({"rule-ecs-index": ecs_schema})

    if (not integration_mappings or existing_mappings) and not non_ecs_schema and not ecs_schema:
        raise ValueError("No mappings found")
    index_lookup.update({"rule-non-ecs-index": non_ecs_schema})
    utils.combine_dicts(combined_mappings, deepcopy(non_ecs_schema))

    return existing_mappings, index_lookup, combined_mappings