# Source: sigma-rules/detection_rules/rule_formatter.py
# Last modified: 2021-03-03 22:12:11 -09:00 (200 lines, 6.9 KiB, Python)
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License
# 2.0; you may not use this file except in compliance with the Elastic License
# 2.0.
"""Helper functions for managing rules in the repository."""
import copy
import io
import textwrap
from collections import OrderedDict
import toml
from .schemas import CurrentSchema
SQ = "'"
DQ = '"'
TRIPLE_SQ = SQ * 3
TRIPLE_DQ = DQ * 3
def cleanup_whitespace(val):
    """Collapse a multi-line string into one space-separated line; return non-strings unchanged."""
    if not isinstance(val, str):
        return val
    stripped_lines = (segment.strip() for segment in val.strip().splitlines())
    return " ".join(stripped_lines)
def nested_normalize(d, skip_cleanup=False, eql_rule=False):
    """Recursively normalize whitespace in a nested rule structure.

    Strings are collapsed to a single line unless ``skip_cleanup`` is set.
    Markdown fields keep their newlines/whitespace, and EQL queries are left
    untouched. Dicts are normalized in place and returned.
    """
    if isinstance(d, str):
        return d if skip_cleanup else cleanup_whitespace(d)
    if isinstance(d, list):
        # skip_cleanup is not forwarded into list items (matches original behavior)
        return [nested_normalize(item, eql_rule=eql_rule) for item in d]
    if not isinstance(d, dict):
        return d
    for key, value in d.items():
        if key == 'query':
            # TODO: the linter still needs some work, but once up to par, uncomment to implement - kql.lint(v)
            # EQL queries are preserved verbatim; other queries get normalized
            d[key] = value if eql_rule else nested_normalize(value)
        elif key in CurrentSchema.markdown_fields():
            # let these maintain newlines and whitespace for markdown support
            d[key] = nested_normalize(value, skip_cleanup=True, eql_rule=eql_rule)
        else:
            d[key] = nested_normalize(value, eql_rule=eql_rule)
    return d
def wrap_text(v, block_indent=0, join=False):
    """Block and indent a blob of text.

    Whitespace is collapsed first, then the text is wrapped at 120 columns
    with each line indented by ``block_indent`` spaces and terminated with a
    newline. Returns a list of lines, or a single string when ``join`` is set.
    """
    flattened = ' '.join(v.split())
    indent = ' ' * block_indent
    wrapped = textwrap.wrap(flattened, width=120, initial_indent=indent, subsequent_indent=indent,
                            break_long_words=False, break_on_hyphens=False)
    terminated = ['{}\n'.format(line) for line in wrapped]
    return ''.join(terminated) if join else terminated
# Marker subclass of str: values wrapped in NonformattedField skip the
# wrap_text() re-flow in RuleTomlEncoder.dump_str (their own line breaks are
# kept), which is how markdown fields preserve formatting in the TOML output.
class NonformattedField(str):
    """Non-formatting class."""
class RuleTomlEncoder(toml.TomlEncoder):
    """Generate a pretty form of toml."""

    def __init__(self, _dict=dict, preserve=False):
        """Create the encoder but override some default functions."""
        super(RuleTomlEncoder, self).__init__(_dict, preserve)
        # keep references to the stock dumpers so we can fall back to them
        self._old_dump_str = toml.TomlEncoder().dump_funcs[str]
        self._old_dump_list = toml.TomlEncoder().dump_funcs[list]
        self.dump_funcs[str] = self.dump_str
        self.dump_funcs[type(u"")] = self.dump_str
        self.dump_funcs[list] = self.dump_list
        self.dump_funcs[NonformattedField] = self.dump_str

    def dump_str(self, v):
        """Change the TOML representation to multi-line or single quote when logical."""
        initial_newline = ['\n']
        if isinstance(v, NonformattedField):
            # first line break is not forced like other multiline string dumps
            lines = v.splitlines(True)
            initial_newline = []
        else:
            lines = wrap_text(v)
        multiline = len(lines) > 1
        # use a raw (''' or ') form when the value contains " but no ' (or is multiline),
        # unless it contains """ which would break the delimiter
        raw = (multiline or (DQ in v and SQ not in v)) and TRIPLE_DQ not in v
        if multiline:
            if raw:
                return "".join([TRIPLE_DQ] + initial_newline + lines + [TRIPLE_DQ])
            else:
                return "\n".join([TRIPLE_SQ] + [self._old_dump_str(line)[1:-1] for line in lines] + [TRIPLE_SQ])
        elif raw:
            # BUG FIX: wrap_text() terminates every line with '\n'; emitting it
            # verbatim put a raw newline inside a single-quoted TOML literal
            # string, which is invalid TOML. Strip the trailing newline(s).
            return u"'{:s}'".format(lines[0].rstrip('\n'))
        return self._old_dump_str(v)

    def _dump_flat_list(self, v):
        """A slightly tweaked version of original dump_list, removing trailing commas."""
        if not v:
            return "[]"
        retval = "[" + str(self.dump_value(v[0])) + ","
        for u in v[1:]:
            retval += " " + str(self.dump_value(u)) + ","
        retval = retval.rstrip(',') + "]"
        return retval

    def dump_list(self, v):
        """Dump a list more cleanly."""
        # long lists of strings get one entry per line; short ones stay flat
        if all([isinstance(d, str) for d in v]) and sum(len(d) + 3 for d in v) > 100:
            dump = []
            for item in v:
                if len(item) > (120 - 4 - 3 - 3) and ' ' in item:
                    dump.append(' """\n{} """'.format(wrap_text(item, block_indent=4, join=True)))
                else:
                    dump.append(' ' * 4 + self.dump_value(item))
            return '[\n{},\n]'.format(',\n'.join(dump))
        return self._dump_flat_list(v)
def toml_write(rule_contents, outfile=None):
    """Write rule in TOML.

    :param rule_contents: dict with optional 'metadata' and 'rule' sub-dicts;
        the caller's dict is not mutated (a deep copy is taken below).
    :param outfile: an open file object, a path to open for writing, or None
        to print to stdout.
    """
    def write(text, nl=True):
        # emit to the output file when one was given, otherwise to stdout
        if outfile:
            outfile.write(text)
            if nl:
                outfile.write(u"\n")
        else:
            print(text, end='' if not nl else '\n')
    encoder = RuleTomlEncoder()
    # deep copy so the pops inside _do_write don't mutate the caller's data
    contents = copy.deepcopy(rule_contents)
    needs_close = False
    def _do_write(_data, _contents):
        query = None
        if _data == 'rule':
            # - We want to avoid the encoder for the query and instead use kql-lint.
            # - Linting is done in rule.normalize() which is also called in rule.validate().
            # - Until lint has tabbing, this is going to result in all queries being flattened with no wrapping,
            #   but will at least purge extraneous white space
            query = contents['rule'].pop('query', '').strip()
            # - As tags are expanding, we may want to reconsider the need to have them in alphabetical order
            # tags = contents['rule'].get("tags", [])
            #
            # if tags and isinstance(tags, list):
            #     contents['rule']["tags"] = list(sorted(set(tags)))
        # TOML requires simple key/values before any tables, so split the
        # contents: scalars and flat lists go on top, dicts and nested lists
        # (which the encoder renders as tables/arrays-of-tables) go below.
        top = OrderedDict()
        bottom = OrderedDict()
        for k in sorted(list(_contents)):
            v = _contents.pop(k)
            if isinstance(v, dict):
                bottom[k] = OrderedDict(sorted(v.items()))
            elif isinstance(v, list):
                if any([isinstance(value, (dict, list)) for value in v]):
                    bottom[k] = v
                else:
                    top[k] = v
            elif k in CurrentSchema.markdown_fields():
                # NonformattedField makes the encoder keep markdown newlines
                top[k] = NonformattedField(v)
            else:
                top[k] = v
        if query:
            # sentinel placeholder: holds the 'query' key's position through
            # the encoder, then gets swapped for the raw query text below
            top.update({'query': "XXxXX"})
        top.update(bottom)
        # NOTE(review): this uses the enclosing loop variable `data` rather
        # than the `_data` parameter — same value in practice, since _do_write
        # is only called from the loop below; confirm before refactoring.
        top = toml.dumps(OrderedDict({data: top}), encoder=encoder)
        # we want to preserve the query format, but want to modify it in the context of encoded dump
        if query:
            formatted_query = "\nquery = '''\n{}\n'''{}".format(query, '\n\n' if bottom else '')
            top = top.replace('query = "XXxXX"', formatted_query)
        write(top)
    try:
        # accept a path as well as an already-open file object; only close
        # what we opened ourselves
        if outfile and not isinstance(outfile, io.IOBase):
            needs_close = True
            outfile = open(outfile, 'w')
        for data in ('metadata', 'rule'):
            _contents = contents.get(data, {})
            _do_write(data, _contents)
    finally:
        if needs_close:
            outfile.close()