[Bug] KQL fails validation on uppercase keywords (#3568)

* add todo
* Add a normalize_kql_keywords function to utils
* update rule loader to normalize and warn
* optimized loading
* fix linting
* Moved conversion to kql module.
* Updated unit test
* Refactor KQL parser to normalize keywords via flag
* Fix logic typo
* Update detection_rules/utils.py

Co-authored-by: Justin Ibarra <16747370+brokensound77@users.noreply.github.com>

* Update lib/kql/kql/__init__.py

Co-authored-by: Justin Ibarra <16747370+brokensound77@users.noreply.github.com>

* Updated to fix unit tests and remove warnings
* linting typo
* Added comments
* remove unused imports
* Update kql.parse default

---------

Co-authored-by: Justin Ibarra <16747370+brokensound77@users.noreply.github.com>
Co-authored-by: Mika Ayenson <Mikaayenson@users.noreply.github.com>

(cherry picked from commit 1566c29bae)
2024-04-04 18:03:30 -04:00
parent 07204987f2
commit c6df1d085f
7 changed files with 25 additions and 12 deletions
@@ -1405,7 +1405,7 @@ def get_unique_query_fields(rule: TOMLRule) -> List[str]:

        cfg = set_eql_config(rule.contents.metadata.get('min_stack_version'))
        with eql.parser.elasticsearch_syntax, eql.parser.ignore_missing_functions, eql.parser.skip_optimizations, cfg:
-            parsed = kql.parse(query) if language == 'kuery' else eql.parse_query(query)
+            parsed = kql.parse(query, normalize_kql_keywords=True) if language == 'kuery' else eql.parse_query(query)

        return sorted(set(str(f) for f in parsed if isinstance(f, (eql.ast.Field, kql.ast.Field))))

@@ -36,7 +36,7 @@ class KQLValidator(QueryValidator):

    @cached_property
    def ast(self) -> kql.ast.Expression:
-        return kql.parse(self.query)
+        return kql.parse(self.query, normalize_kql_keywords=True)

    @cached_property
    def unique_fields(self) -> List[str]:
@@ -80,7 +80,7 @@ class KQLValidator(QueryValidator):
                                                                    beats_version, ecs_version)

            try:
-                kql.parse(self.query, schema=schema)
+                kql.parse(self.query, schema=schema, normalize_kql_keywords=True)
            except kql.KqlParseError as exc:
                message = exc.error_msg
                trailer = err_trailer
@@ -135,7 +135,7 @@ class KQLValidator(QueryValidator):

            # Validate the query against the schema
            try:
-                kql.parse(self.query, schema=integration_schema)
+                kql.parse(self.query, schema=integration_schema, normalize_kql_keywords=True)
            except kql.KqlParseError as exc:
                if exc.error_msg == "Unknown field":
                    field = extract_error_field(self.query, exc)
@@ -241,7 +241,7 @@ def convert_time_span(span: str) -> int:

 def evaluate(rule, events):
    """Evaluate a query against events."""
-    evaluator = kql.get_evaluator(kql.parse(rule.query))
+    evaluator = kql.get_evaluator(kql.parse(rule.query, normalize_kql_keywords=True))
    filtered = list(filter(evaluator, events))
    return filtered

@@ -45,12 +45,12 @@ def to_eql(text, optimize=True, schema=None):
    return converted.optimize(recursive=True) if optimize else converted


-def parse(text, optimize=True, schema=None):
+def parse(text, optimize: bool = True, schema: dict = None, normalize_kql_keywords: bool = False):
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    lark_parsed = lark_parse(text)
-    converted = KqlParser(text, schema=schema).visit(lark_parsed)
+    converted = KqlParser(text, schema=schema, normalize_kql_keywords=normalize_kql_keywords).visit(lark_parsed)

    return converted.optimize(recursive=True) if optimize else converted

@@ -104,22 +104,29 @@ class BaseKqlParser(Interpreter):
    quoted_escapes = {"\\t": "\t", "\\r": "\r", "\\n": "\n", "\\\\": "\\", "\\\"": "\""}
    quoted_regex = re.compile("(" + "|".join(re.escape(e) for e in sorted(quoted_escapes)) + ")")

-    def __init__(self, text, schema=None):
+    def __init__(self, text: str, schema: dict = None, normalize_kql_keywords: bool = True) -> None:
+        """Initialize the parser. Defaults to normalizing KQL keywords to lowercase."""
        self.text = text
        self.lines = [t.rstrip("\r\n") for t in self.text.splitlines(True)]
        self.scoped_field = None
        self.mapping_schema = schema
        self.star_fields = []
+        self.normalize_kql_keywords = normalize_kql_keywords

        if schema:
            for field, field_type in schema.items():
                if "*" in field:
                    self.star_fields.append(wildcard2regex(field))

-    def assert_lower_token(self, *tokens):
+    def assert_lower_token(self, *tokens: Token) -> None:
+        """Ensure each token is lowercase, converting it when normalizing and raising an error otherwise."""
        for token in tokens:
-            if str(token) != str(token).lower():
-                raise self.error(token, "Expected '{lower}' but got '{token}'".format(token=token, lower=str(token).lower()))
+            lower_token = str(token).lower()
+            if str(token) != lower_token:
+                if self.normalize_kql_keywords:
+                    token.value = lower_token
+                else:
+                    raise self.error(token, f"Expected '{lower_token}' but got '{token}'")

    def error(self, node, message, end=False, cls=KqlParseError, width=None, **kwargs):
        """Generate an error exception but dont raise it."""
@@ -34,6 +34,12 @@ class LintTests(unittest.TestCase):
            with self.assertRaises(kql.KqlParseError):
                kql.parse(q)

+        for q in queries:
+            # Test query successfully converts and parses
+            parsed_query = kql.parse(q, normalize_kql_keywords=True)
+            # Test that the parsed query differs from the original query, i.e. the transformation was applied
+            self.assertNotEqual(str(parsed_query), q, f"Parsed query {parsed_query} matches the original {q}")
+
    def test_lint_precedence(self):
        self.validate("a:b or (c:d and e:f)", "a:b or c:d and e:f")
        self.validate("(a:b and (c:d or e:f))", "a:b and (c:d or e:f)")
@@ -67,7 +67,7 @@ class TestValidRules(BaseRuleTest):
                )
            ):
                source = rule.contents.data.query
-                tree = kql.parse(source, optimize=False)
+                tree = kql.parse(source, optimize=False, normalize_kql_keywords=True)
                optimized = tree.optimize(recursive=True)
                err_message = f'\n{self.rule_str(rule)} Query not optimized for rule\n' \
                              f'Expected: {optimized}\nActual: {source}'