Files
sigma-rules/detection_rules/rule_formatter.py
T

314 lines
13 KiB
Python
Raw Normal View History

2020-06-29 23:17:38 -06:00
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2021-03-03 22:12:11 -09:00
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
2020-06-29 23:17:38 -06:00
"""Helper functions for managing rules in the repository."""
2020-06-29 23:17:38 -06:00
import copy
import dataclasses
import json
2020-06-29 23:17:38 -06:00
import textwrap
from collections import OrderedDict
from collections.abc import Iterable
from pathlib import Path
from typing import Any, TextIO
2020-06-29 23:17:38 -06:00
import toml
from .schemas import definitions
from .utils import cached
2020-06-29 23:17:38 -06:00
# Quote characters and their tripled forms, used when picking TOML string delimiters.
SQ, DQ = "'", '"'
TRIPLE_SQ, TRIPLE_DQ = SQ * 3, DQ * 3
@cached
def get_preserved_fmt_fields() -> set[str]:
    """Collect the names of ``BaseRuleData`` fields that are typed as Markdown.

    A field qualifies when its declared type is ``definitions.Markdown`` or
    ``definitions.Markdown | None``. The name reported for a field is its
    ``data_key`` metadata entry when present, otherwise the attribute name.
    """
    # imported inside the function rather than at module import time
    from .rule import BaseRuleData

    markdown_types = (definitions.Markdown, definitions.Markdown | None)
    return {
        rule_field.metadata.get("data_key", rule_field.name)
        for rule_field in dataclasses.fields(BaseRuleData)
        if rule_field.type in markdown_types
    }
def cleanup_whitespace(val: Any) -> Any:
    """Flatten a string onto a single line; pass any other value through unchanged.

    The string is stripped, split into lines, each line is stripped again, and
    the pieces are joined with a single space.
    """
    if not isinstance(val, str):
        return val
    stripped_lines = [line.strip() for line in val.strip().splitlines()]
    return " ".join(stripped_lines)
def nested_normalize(d: Any, skip_cleanup: bool = False) -> Any:
    """Recursively normalize whitespace in strings found inside ``d``.

    - A string is returned as-is when ``skip_cleanup`` is set, otherwise it is
      passed through ``cleanup_whitespace``.
    - A list yields a new list with every element normalized.
    - A dict is updated in place and returned: the ``query`` key is left alone,
      keys reported by ``get_preserved_fmt_fields`` are recursed into with
      ``skip_cleanup=True``, and every other key is normalized normally.
    - Any other value is returned unchanged.
    """
    markdown_keys = get_preserved_fmt_fields()

    if isinstance(d, str):
        if skip_cleanup:
            return d
        return cleanup_whitespace(d)

    if isinstance(d, list):
        return [nested_normalize(item) for item in d]  # type: ignore[reportUnknownVariableType]

    if not isinstance(d, dict):
        return d

    for key, value in d.items():  # type: ignore[reportUnknownVariableType]
        if key == "query":
            # the linter still needs some work, but once up to par, uncomment to implement - kql.lint(v)
            # do not normalize queries
            continue
        # markdown fields keep their newlines and whitespace; everything else is cleaned up
        d[key] = nested_normalize(value, skip_cleanup=key in markdown_keys)
    return d  # type: ignore[reportUnknownVariableType]
2020-06-29 23:17:38 -06:00
def wrap_text(v: str, block_indent: int = 0) -> list[str]:
    """Collapse whitespace in ``v`` and wrap it into newline-terminated lines.

    Lines are wrapped at 120 columns, each prefixed with ``block_indent``
    spaces; long words and hyphenated words are never split. Returns the list
    of lines, each ending in a newline.
    """
    indent = " " * block_indent
    collapsed = " ".join(v.split())
    wrapped = textwrap.wrap(
        collapsed,
        width=120,
        initial_indent=indent,
        subsequent_indent=indent,
        break_long_words=False,
        break_on_hyphens=False,
    )
    lines = [f"{segment}\n" for segment in wrapped]

    # If there is a single line that contains a quote, add an empty trailing entry to trigger multiline formatting
    if len(lines) == 1 and '"' in lines[0]:
        lines.append("")
    return lines
2020-06-29 23:17:38 -06:00
def wrap_text_and_join(v: str, block_indent: int = 0) -> str:
    """Wrap ``v`` with ``wrap_text`` and return the resulting lines concatenated into one string."""
    return "".join(wrap_text(v, block_indent=block_indent))
class NonformattedField(str):  # noqa: SLOT000
    """Marker string type for text that must not be re-wrapped.

    Instances behave exactly like ``str``; the distinct type only exists so the
    TOML encoder can recognise the value and keep its own line breaks.
    """
def preserve_formatting_for_fields(data: OrderedDict[str, Any], fields_to_preserve: list[str]) -> OrderedDict[str, Any]:
    """Wrap selected nested values of ``data`` in ``NonformattedField``.

    Each entry of ``fields_to_preserve`` is a dot-separated key path. When the
    full path resolves (every intermediate value is a dict and the last key is
    present), the value at the end is replaced in place by a
    ``NonformattedField`` built from it. Paths that do not resolve are ignored.
    ``data`` itself is returned.
    """

    def mark_leaf(container: OrderedDict[str, Any], path_parts: list[str]) -> None:
        """Follow ``path_parts`` into ``container`` and wrap the final value if it exists."""
        node = container
        for parent_key in path_parts[:-1]:
            # a missing or non-dict intermediate means there is nothing to preserve
            if parent_key not in node or not isinstance(node[parent_key], dict):
                return
            node = node[parent_key]

        leaf_key = path_parts[-1]
        if leaf_key in node:
            node[leaf_key] = NonformattedField(node[leaf_key])

    for dotted_path in fields_to_preserve:
        mark_leaf(data, dotted_path.split("."))
    return data
class RuleTomlEncoder(toml.TomlEncoder):  # type: ignore[reportMissingTypeArgument]
    """Generate a pretty form of toml.

    Overrides the string and list dumpers of ``toml.TomlEncoder`` so that long
    text is wrapped into multi-line strings and long lists are spread over
    several lines.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Create the encoder but override some default functions."""
        super().__init__(*args, **kwargs)  # type: ignore[reportUnknownMemberType]

        # keep a handle on the library's stock list dumper
        self._old_dump_list = toml.TomlEncoder().dump_funcs[list]
        self.dump_funcs[str] = self.dump_str
        self.dump_funcs[list] = self.dump_list
        self.dump_funcs[NonformattedField] = self.dump_str

    def dump_str(self, v: str | NonformattedField) -> str:
        """Change the TOML representation to multi-line or single quote when logical.

        Plain strings are re-wrapped with ``wrap_text``; ``NonformattedField``
        values keep their own line breaks. The returned text is the complete
        TOML string token, including its delimiters.
        """
        initial_newline = ["\n"]

        if isinstance(v, NonformattedField):
            # first line break is not forced like other multiline string dumps
            lines = v.splitlines(True)
            initial_newline = []
        else:
            lines = wrap_text(v)

        multiline = len(lines) > 1
        # "raw" means the text is written between the delimiters as-is, with no escaping applied here
        raw = (multiline or (DQ in v and SQ not in v)) and TRIPLE_DQ not in v

        if multiline:
            if raw:
                return "".join([TRIPLE_DQ, *initial_newline, *lines, TRIPLE_DQ])
            # the text itself contains a triple double-quote: fall back to triple single quotes
            return "\n".join([TRIPLE_SQ] + [json.dumps(line)[1:-1] for line in lines] + [TRIPLE_SQ])
        if raw:
            return f"'{lines[0]:s}'"
        # In the toml library there is a magic replace for \\\\x -> u00 that we wish to avoid until #4979 is resolved
        # Also addresses an issue where backslashes in certain strings are not properly escaped by the library's
        # default string dumper
        return json.dumps(v)

    def _dump_flat_list(self, v: Iterable[Any]) -> str:
        """A slightly tweaked version of original dump_list, removing trailing commas.

        Returns ``[]`` for an empty iterable, otherwise the dumped values
        separated by ``", "`` inside square brackets.
        """
        # materialize once so one-shot iterables are handled like lists
        items = list(v)
        if not items:
            return "[]"
        return "[" + ", ".join(str(self.dump_value(item)) for item in items) + "]"

    def dump_list(self, v: Iterable[Any]) -> str:
        """Dump a list more cleanly.

        - All-string lists whose combined length exceeds 100 are written one
          item per line; items that are long and contain a space are wrapped
          into an indented multi-line string.
        - Non-empty lists made only of dicts are written as one inline table
          per line.
        - Everything else is written on a single line.
        """
        # the input is inspected several times below, so take a concrete list first
        items = list(v)
        indent = " " * 4

        if all(isinstance(d, str) for d in items) and sum(len(d) + 3 for d in items) > 100:  # noqa: PLR2004
            dump: list[str] = []
            for item in items:
                if len(item) > (120 - 4 - 3 - 3) and " " in item:
                    # delimiters share the indent used for the wrapped text and for sibling items
                    dump.append(f'{indent}"""\n{wrap_text_and_join(item, block_indent=4)}{indent}"""')
                else:
                    dump.append(indent + self.dump_value(item))
            return "[\n{},\n]".format(",\n".join(dump))

        if items and all(isinstance(i, dict) for i in items):
            # Compact inline format for lists of dictionaries with proper indentation
            retval = "\n" + " " * 2 + "[\n"
            retval += ",\n".join([indent + self.dump_inline_table(u).strip() for u in items])
            retval += "\n" + " " * 2 + "]\n"
            return retval

        return self._dump_flat_list(items)
def toml_write(rule_contents: dict[str, Any], out_file_path: Path | None = None) -> None:  # noqa: PLR0915
    """Write rule in TOML.

    A deep copy of ``rule_contents`` is serialized section by section
    (``metadata``, ``transform``, ``rule``; empty or missing sections are
    skipped). Output goes to ``out_file_path`` when given, otherwise it is
    printed. The caller's ``rule_contents`` is never modified.

    Args:
        rule_contents: mapping of section name to that section's fields.
        out_file_path: destination file; it is written as UTF-8, the encoding
            required for TOML documents.
    """
    encoder = RuleTomlEncoder()
    contents = copy.deepcopy(rule_contents)

    # top-level text fields whose backslashes are doubled before encoding
    escaped_text_fields = ("note", "setup", "description")
    # list-valued fields mapped to the nested key paths whose formatting is preserved
    nested_preserved_fields = {
        "actions": ["params.message"],
        "filters": ["meta.value"],
    }

    def order_rule(obj: Any) -> Any:
        """Return ``obj`` with dict keys sorted and lists sorted by their JSON dump, recursively."""
        if isinstance(obj, dict):
            obj = OrderedDict(sorted(obj.items()))  # type: ignore[reportUnknownArgumentType, reportUnknownVariableType]
            for k, v in obj.items():
                if isinstance(v, dict | list):
                    obj[k] = order_rule(v)

        if isinstance(obj, list):
            for i, v in enumerate(obj):  # type: ignore[reportUnknownMemberType]
                if isinstance(v, dict | list):
                    obj[i] = order_rule(v)
            obj = sorted(obj, key=lambda x: json.dumps(x))  # type: ignore[reportUnknownArgumentType, reportUnknownVariableType]

        return obj

    def _do_write(f: TextIO | None, _data: str, _contents: dict[str, Any]) -> None:  # noqa: PLR0912
        """Encode one section and write it to ``f``, or print it when ``f`` is None."""
        query = None
        threat_query = None

        if _data == "rule":
            # - We want to avoid the encoder for the query and instead use kql-lint.
            # - Linting is done in rule.normalize() which is also called in rule.validate().
            # - Until lint has tabbing, this is going to result in all queries being flattened with no wrapping,
            #     but will at least purge extraneous white space
            query = _contents.pop("query", "").strip()

            # - As tags are expanding, we may want to reconsider the need to have them in alphabetical order
            threat_query = _contents.pop("threat_query", "").strip()

        # scalars and flat lists are emitted first ("top"); dicts and nested lists follow ("bottom")
        top: OrderedDict[str, Any] = OrderedDict()
        bottom: OrderedDict[str, Any] = OrderedDict()

        for k in sorted(_contents):
            v = _contents.pop(k)

            if k in nested_preserved_fields:
                # explicitly preserve formatting for the configured nested field of each entry
                preserved_fields = nested_preserved_fields[k]
                v = [preserve_formatting_for_fields(entry, preserved_fields) for entry in v] if v is not None else []

            if k in escaped_text_fields and isinstance(v, str):
                # Transform instances of \ to \\ as calling write will convert \\ to \.
                # This will ensure that the output file has the correct number of backslashes.
                v = v.replace("\\", "\\\\")

            if k == "osquery" and isinstance(v, list):
                # Specifically handle transform.osquery queries
                for osquery_item in v:  # type: ignore[reportUnknownVariableType]
                    if "query" in osquery_item and isinstance(osquery_item["query"], str):
                        # Transform instances of \ to \\ as calling write will convert \\ to \.
                        # This will ensure that the output file has the correct number of backslashes.
                        osquery_item["query"] = osquery_item["query"].replace("\\", "\\\\")  # type: ignore[reportUnknownMemberType]

            if isinstance(v, dict):
                bottom[k] = OrderedDict(sorted(v.items()))  # type: ignore[reportUnknownArgumentType]
            elif isinstance(v, list):
                if any(isinstance(value, (dict | list)) for value in v):  # type: ignore[reportUnknownArgumentType]
                    bottom[k] = v
                else:
                    top[k] = v
            elif k in get_preserved_fmt_fields():
                top[k] = NonformattedField(v)
            else:
                top[k] = v

        # placeholders keep the queries' position in the output; the real text is substituted after encoding
        if query:
            top.update({"query": "XXxXX"})  # type: ignore[reportUnknownMemberType]

        if threat_query:
            top.update({"threat_query": "XXxXX"})  # type: ignore[reportUnknownMemberType]

        top.update(bottom)  # type: ignore[reportUnknownMemberType]
        top_out = toml.dumps(OrderedDict({_data: top}), encoder=encoder)  # type: ignore[reportUnknownMemberType]

        # we want to preserve the threat_query format, but want to modify it in the context of encoded dump
        # (done before the plain query: the query placeholder text is a suffix of the threat_query one)
        if threat_query:
            formatted_threat_query = "\nthreat_query = '''\n{}\n'''{}".format(threat_query, "\n\n" if bottom else "")
            top_out = top_out.replace('threat_query = "XXxXX"', formatted_threat_query)

        # we want to preserve the query format, but want to modify it in the context of encoded dump
        if query:
            formatted_query = "\nquery = '''\n{}\n'''{}".format(query, "\n\n" if bottom else "")
            top_out = top_out.replace('query = "XXxXX"', formatted_query)

        if f:
            _ = f.write(top_out + "\n")
        else:
            print(top_out)

    # TOML documents must be UTF-8, so do not rely on the platform default encoding
    f = out_file_path.open("w", encoding="utf-8") if out_file_path else None

    try:
        for data in ("metadata", "transform", "rule"):
            _contents = contents.get(data, {})
            if not _contents:
                continue
            # NOTE(review): the return value is discarded, so only the in-place replacement of
            # elements inside nested lists takes effect here - confirm whether rebinding was intended
            order_rule(_contents)
            _do_write(f, data, _contents)
    finally:
        if f:
            f.close()