From 324005a12612cf821289520f8ce1f418e96cc568 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Tue, 12 Nov 2019 11:46:43 +0100 Subject: [PATCH 1/3] [feature] extend es-dsl to support nested aggregations --- tests/test-backend-es-qs.py | 3 +- tools/sigma/backends/elasticsearch.py | 88 +++++++++++++++++++++------ tools/sigma/parser/condition.py | 21 ++++++- 3 files changed, 91 insertions(+), 21 deletions(-) diff --git a/tests/test-backend-es-qs.py b/tests/test-backend-es-qs.py index 4fc0a6359..5bb10d2fe 100755 --- a/tests/test-backend-es-qs.py +++ b/tests/test-backend-es-qs.py @@ -16,7 +16,6 @@ # along with this program. If not, see . import asyncio -import functools import sys import pprint import elasticsearch @@ -41,6 +40,7 @@ except elasticsearch.exceptions.RequestError as e: queries = asyncio.Queue() + # sigmac runner coroutinne async def run_sigmac(): sigmac = asyncio.create_subprocess_exec( @@ -70,6 +70,7 @@ async def run_sigmac(): print("* sigmac returned with exit code {}".format(exitcode)) return exitcode + # Generated query checker loop async def check_queries(): failed = list() diff --git a/tools/sigma/backends/elasticsearch.py b/tools/sigma/backends/elasticsearch.py index 423b93dc4..dbc14cf2e 100644 --- a/tools/sigma/backends/elasticsearch.py +++ b/tools/sigma/backends/elasticsearch.py @@ -173,9 +173,6 @@ class ElasticsearchDSLBackend(RulenameCommentMixin, ElasticsearchWildcardHandlin self.queries[-1]['query']['constant_score']['filter'] = self.generateNode(parsed.parsedSearch) if parsed.parsedAgg: self.generateAggregation(parsed.parsedAgg) - # if parsed.parsedAgg: - # fields += self.generateAggregation(parsed.parsedAgg) - # self.fields.update(fields) def generateANDNode(self, node): andNode = {'bool': {'must': []}} @@ -253,32 +250,83 @@ class ElasticsearchDSLBackend(RulenameCommentMixin, ElasticsearchWildcardHandlin return {'exists': {'field': node.item}} def generateAggregation(self, agg): + """ + Generates an Elasticsearch nested aggregation 
given a SigmaAggregationParser object + + Two conditions are handled here: + a) "count() by MyGroupedField > X" + b) "count(MyDistinctFieldName) by MyGroupedField > X" + + The case (b) is translated to the following equivalent SQL query + + ``` + SELECT MyGroupedField, COUNT(DISTINCT MyDistinctFieldName) FROM Table + GROUP BY MyGroupedField HAVING COUNT(DISTINCT MyDistinctFieldName) > X + ``` + + The resulting aggregation is set on 'self.queries[-1]["aggs"]' as a Python dict + + :param agg: Input SigmaAggregationParser object that defines a condition + :return: None + """ if agg: if agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_COUNT: if agg.groupfield is not None: - self.queries[-1]['aggs'] = { - '%s_count'%(agg.groupfield or ""): { - 'terms': { - 'field': '%s'%(agg.groupfield + ".keyword" or "") - }, - 'aggs': { - 'limit': { - 'bucket_selector': { - 'buckets_path': { - 'count': '%s_count'%(agg.groupfield or "") + # If the aggregation is 'count(MyDistinctFieldName) by MyGroupedField > XYZ' + if agg.aggfield is not None: + count_agg_group_name = "{}_count".format(agg.groupfield) + count_distinct_agg_name = "{}_distinct".format(agg.aggfield) + script_limit = "params.count {} {}".format(agg.cond_op, agg.condition) + self.queries[-1]['aggs'] = { + "aggs": { + count_agg_group_name: { + "terms": { + "field": agg.groupfield + }, + "aggs": { + count_distinct_agg_name: { + "cardinality": { + "field": agg.aggfield + } }, - 'script': 'params.count %s %s'%(agg.cond_op, agg.condition) + "limit": { + "bucket_selector": { + "buckets_path": { + "count": count_distinct_agg_name + }, + 'script': script_limit + } + } + } + } + } + } + else: # if the condition is count() by MyGroupedField > XYZ + group_aggname = "%s_count".format(agg.groupfield) + self.queries[-1]['aggs'] = { + group_aggname: { + 'terms': { + 'field': '%s' % (agg.groupfield + ".keyword") + }, + 'aggs': { + 'limit': { + 'bucket_selector': { + 'buckets_path': { + 'count': group_aggname + 
}, + 'script': 'params.count %s %s' % (agg.cond_op, agg.condition) + } } } } } - } else: + funcname = "" for name, idx in agg.aggfuncmap.items(): if idx == agg.aggfunc: funcname = name break - raise NotImplementedError("%s : The '%s' aggregation operator is not yet implemented for this backend"%(self.title, funcname)) + raise NotImplementedError("%s : The '%s' aggregation operator is not yet implemented for this backend" % (self.title, funcname)) def generateBefore(self, parsed): self.queries.append({'query': {'constant_score': {'filter': {}}}}) @@ -895,14 +943,18 @@ class ElastalertBackend(MultiRuleOutputMixin): def generateAggregation(self, agg): if agg: - if agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_COUNT or agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_MIN or agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_MAX or agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_AVG or agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_SUM: + if agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_COUNT or \ + agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_MIN or \ + agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_MAX or \ + agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_AVG or \ + agg.aggfunc == sigma.parser.condition.SigmaAggregationParser.AGGFUNC_SUM: return "" else: for name, idx in agg.aggfuncmap.items(): if idx == agg.aggfunc: funcname = name break - raise NotImplementedError("%s : The '%s' aggregation operator is not yet implemented for this backend"%(self.title, funcname)) + raise NotImplementedError("%s : The '%s' aggregation operator is not yet implemented for this backend" % ( self.title, funcname)) def convertLevel(self, level): return { diff --git a/tools/sigma/parser/condition.py b/tools/sigma/parser/condition.py index db40575d3..21149a84d 100644 --- 
a/tools/sigma/parser/condition.py +++ b/tools/sigma/parser/condition.py @@ -24,6 +24,7 @@ COND_OR = 2 COND_NOT = 3 COND_NULL = 4 + # Debugging code def dumpNode(node, indent=''): # pragma: no cover """ @@ -42,6 +43,7 @@ def dumpNode(node, indent=''): # pragma: no cover repr(node))) return node + # Condition Tokenizer class SigmaConditionToken: """Token of a Sigma condition expression""" @@ -100,6 +102,7 @@ class SigmaConditionToken: def __str__(self): # pragma: no cover return "[ Token: %s: '%s' ]" % (self.tokenstr[self.type], self.matched) + class SigmaConditionTokenizer: """Tokenize condition string into token sequence""" tokendefs = [ # list of tokens, preferred recognition in given order, (token identifier, matching regular expression). Ignored if token id == None @@ -172,6 +175,7 @@ class SigmaConditionTokenizer: def index(self, item): return self.tokens.index(item) + ### Parse Tree Node Classes ### class ParseTreeNode: """Parse Tree Node Base Class""" @@ -181,6 +185,7 @@ class ParseTreeNode: def __str__(self): # pragma: no cover return "[ %s: %s ]" % (self.__doc__, str([str(item) for item in self.items])) + class ConditionBase(ParseTreeNode): """Base class for conditional operations""" op = COND_NONE @@ -198,6 +203,7 @@ class ConditionBase(ParseTreeNode): def __len__(self): return len(self.items) + class ConditionAND(ConditionBase): """AND Condition""" op = COND_AND @@ -208,10 +214,12 @@ class ConditionAND(ConditionBase): else: # called by parser, use given values self.items = args + class ConditionOR(ConditionAND): """OR Condition""" op = COND_OR + class ConditionNOT(ConditionBase): """NOT Condition""" op = COND_NOT @@ -235,19 +243,23 @@ class ConditionNOT(ConditionBase): except IndexError: return None + class ConditionNULLValue(ConditionNOT): """Condition: Field value is empty or doesn't exists""" pass + class ConditionNotNULLValue(ConditionNULLValue): """Condition: Field value is not empty""" pass + class NodeSubexpression(ParseTreeNode): 
"""Subexpression""" def __init__(self, subexpr): self.items = subexpr + # Parse tree generators: generate parse tree nodes from extended conditions def generateXOf(sigma, val, condclass): """ @@ -274,18 +286,22 @@ def generateXOf(sigma, val, condclass): else: # OR across all items of definition return NodeSubexpression(sigma.parse_definition_byname(val.matched, condclass)) + def generateAllOf(sigma, op, val): """Convert 'all of x' expressions into ConditionAND""" return generateXOf(sigma, val, ConditionAND) + def generateOneOf(sigma, op, val): """Convert '1 of x' expressions into ConditionOR""" return generateXOf(sigma, val, ConditionOR) + def convertId(sigma, op): """Convert search identifiers (lists or maps) into condition nodes according to spec defaults""" return NodeSubexpression(sigma.parse_definition_byname(op.matched)) + # Optimizer class SigmaConditionOptimizer: """ @@ -548,7 +564,8 @@ class SigmaConditionParser: def __len__(self): # pragma: no cover return len(self.parsedSearch) - + + # Aggregation parser class SigmaAggregationParser(SimpleParser): """Parse Sigma aggregation expression and provide parsed data""" @@ -599,7 +616,7 @@ class SigmaAggregationParser(SimpleParser): SigmaConditionToken.TOKEN_ID: (None, "store_search_id", 9), }, ] - finalstates = { -1, 9 } + finalstates = {-1, 9} # Aggregation functions AGGFUNC_COUNT = 1 From e7ed0fa9ea3f02a8bd7d89b058e9b2c5abb3076e Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Tue, 12 Nov 2019 14:06:10 +0100 Subject: [PATCH 2/3] added unit test --- tools/requirements-devel.txt | 1 + tools/sigma/backends/elasticsearch.py | 2 +- tools/tests/test_backend_elasticsearch.py | 30 +++++++++++++++++++++++ tools/tests/test_parsing.py | 2 ++ 4 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 tools/tests/test_backend_elasticsearch.py diff --git a/tools/requirements-devel.txt b/tools/requirements-devel.txt index 219522f2b..a946d533e 100644 --- a/tools/requirements-devel.txt +++ 
b/tools/requirements-devel.txt @@ -6,3 +6,4 @@ elasticsearch-async setuptools wheel pymisp +pytest diff --git a/tools/sigma/backends/elasticsearch.py b/tools/sigma/backends/elasticsearch.py index dbc14cf2e..fed70bc2d 100644 --- a/tools/sigma/backends/elasticsearch.py +++ b/tools/sigma/backends/elasticsearch.py @@ -294,7 +294,7 @@ class ElasticsearchDSLBackend(RulenameCommentMixin, ElasticsearchWildcardHandlin "buckets_path": { "count": count_distinct_agg_name }, - 'script': script_limit + "script": script_limit } } } diff --git a/tools/tests/test_backend_elasticsearch.py b/tools/tests/test_backend_elasticsearch.py new file mode 100644 index 000000000..b984ff70d --- /dev/null +++ b/tools/tests/test_backend_elasticsearch.py @@ -0,0 +1,30 @@ +from sigma.backends.elasticsearch import ElasticsearchDSLBackend +from sigma.configuration import SigmaConfiguration +from sigma.parser.condition import SigmaAggregationParser + + +def test_backend_elastic(): + sigma_config = SigmaConfiguration() + backend = ElasticsearchDSLBackend(sigma_config) + + # setup the aggregator input object without calling __init__() + agg = object.__new__(SigmaAggregationParser) + agg.condition = "3" + agg.cond_op = "<" + agg.aggfunc = SigmaAggregationParser.AGGFUNC_COUNT + agg.aggfield = "aggfield" + agg.groupfield = "groupfield" + + # Make queries non-empty + backend.queries = [{}] + + backend.generateAggregation(agg) + + assert len(backend.queries) == 1, "backend has exactly one query" + assert ( + "groupfield_count" in backend.queries[0]["aggs"]["aggs"] + ), "groupfield_count is the top aggregation key" + assert ( + "aggfield_distinct" + in backend.queries[0]["aggs"]["aggs"]["groupfield_count"]["aggs"] + ), "aggfield_distinct is the nested aggregation key" diff --git a/tools/tests/test_parsing.py b/tools/tests/test_parsing.py index eeb0d7ea9..4d3d1588e 100644 --- a/tools/tests/test_parsing.py +++ b/tools/tests/test_parsing.py @@ -1 +1,3 @@ + def test_collection(): + pass From 
3c7f522017d2ed5f5f8f25981a08b7207bc81952 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 14 Nov 2019 14:34:50 +0100 Subject: [PATCH 3/3] add .keyword on aggs; add extra unit test --- tools/sigma/backends/elasticsearch.py | 10 ++--- tools/tests/test_backend_elasticsearch.py | 51 +++++++++++++++++++---- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/tools/sigma/backends/elasticsearch.py b/tools/sigma/backends/elasticsearch.py index fed70bc2d..6cf595cc3 100644 --- a/tools/sigma/backends/elasticsearch.py +++ b/tools/sigma/backends/elasticsearch.py @@ -278,15 +278,14 @@ class ElasticsearchDSLBackend(RulenameCommentMixin, ElasticsearchWildcardHandlin count_distinct_agg_name = "{}_distinct".format(agg.aggfield) script_limit = "params.count {} {}".format(agg.cond_op, agg.condition) self.queries[-1]['aggs'] = { - "aggs": { - count_agg_group_name: { + count_agg_group_name: { "terms": { - "field": agg.groupfield + "field": "{}.keyword".format(agg.groupfield) }, "aggs": { count_distinct_agg_name: { "cardinality": { - "field": agg.aggfield + "field": "{}.keyword".format(agg.aggfield) } }, "limit": { @@ -300,9 +299,8 @@ class ElasticsearchDSLBackend(RulenameCommentMixin, ElasticsearchWildcardHandlin } } } - } else: # if the condition is count() by MyGroupedField > XYZ - group_aggname = "%s_count".format(agg.groupfield) + group_aggname = "{}_count".format(agg.groupfield) self.queries[-1]['aggs'] = { group_aggname: { 'terms': { diff --git a/tools/tests/test_backend_elasticsearch.py b/tools/tests/test_backend_elasticsearch.py index b984ff70d..3da03eb7d 100644 --- a/tools/tests/test_backend_elasticsearch.py +++ b/tools/tests/test_backend_elasticsearch.py @@ -4,6 +4,11 @@ from sigma.parser.condition import SigmaAggregationParser def test_backend_elastic(): + """ + Test aggregation of the form + + count(aggfield) by GroupField < 3 + """ sigma_config = SigmaConfiguration() backend = ElasticsearchDSLBackend(sigma_config) @@ -13,18 +18,48 @@ def 
test_backend_elastic(): agg.cond_op = "<" agg.aggfunc = SigmaAggregationParser.AGGFUNC_COUNT agg.aggfield = "aggfield" - agg.groupfield = "groupfield" + agg.groupfield = "GroupField" # Make queries non-empty backend.queries = [{}] backend.generateAggregation(agg) + inner_agg = backend.queries[0]["aggs"]["GroupField_count"]["aggs"] + bucket_selector = backend.queries[0]["aggs"]["GroupField_count"]["aggs"]["limit"]["bucket_selector"] assert len(backend.queries) == 1, "backend has exactly one query" - assert ( - "groupfield_count" in backend.queries[0]["aggs"]["aggs"] - ), "groupfield_count is the top aggregation key" - assert ( - "aggfield_distinct" - in backend.queries[0]["aggs"]["aggs"]["groupfield_count"]["aggs"] - ), "aggfield_distinct is the nested aggregation key" + assert ("GroupField_count" in backend.queries[0]["aggs"]), "GroupField_count is the top aggregation key" + assert ("aggfield_distinct" in backend.queries[0]["aggs"]["GroupField_count"]["aggs"]), "aggfield_distinct is the nested aggregation key" + assert ("GroupField_count" in backend.queries[0]["aggs"]), "GroupField_count is the top aggregation key" + assert "{}.keyword".format(agg.aggfield) == inner_agg["aggfield_distinct"]["cardinality"]["field"], "inner agg field must have suffix .keyword" + assert ("params.count < 3" in bucket_selector["script"]), "bucket selector script must be 'params.count < 3'" + assert "count" in bucket_selector["buckets_path"], "buckets_path must be 'count'" + + +def test_backend_elastic_count_nofield_agg(): + """ + Test aggregation of the form + + count() by GroupedField < 3 + """ + + sigma_config = SigmaConfiguration() + backend = ElasticsearchDSLBackend(sigma_config) + + # setup the aggregator input object without calling __init__() + agg = object.__new__(SigmaAggregationParser) + agg.condition = "3" + agg.cond_op = "<" + agg.aggfunc = SigmaAggregationParser.AGGFUNC_COUNT + agg.aggfield = None + agg.groupfield = "GroupedField" + + # Make queries non-empty + 
backend.queries = [{}] + backend.generateAggregation(agg) + bucket_selector = backend.queries[0]["aggs"]["GroupedField_count"]["aggs"]["limit"]["bucket_selector"] + + assert len(backend.queries) == 1, "backend has exactly one query" + assert ("GroupedField_count" in backend.queries[0]["aggs"]), "GroupedField_count is the top aggregation key" + assert ("params.count < 3" in bucket_selector["script"]), "bucket selector script must be 'params.count < 3'" + assert "count" in bucket_selector["buckets_path"], "buckets_path must be 'count'"