Files
blue-team-tools/tests/reference-archiver.py
Nasreddine Bencherchali a77d3bae4b
Goodlog Tests / check-baseline-win7 (push) Waiting to run
Goodlog Tests / check-baseline-win10 (push) Waiting to run
Goodlog Tests / check-baseline-win11 (push) Waiting to run
Goodlog Tests / check-baseline-win11-2023 (push) Waiting to run
Goodlog Tests / check-baseline-win2022 (push) Waiting to run
Goodlog Tests / check-baseline-win2022-domain-controller (push) Waiting to run
Goodlog Tests / check-baseline-win2022-0-20348-azure (push) Waiting to run
Create Release / Create Release (push) Waiting to run
Sigma Rule Tests / yamllint (push) Waiting to run
Sigma Rule Tests / test-sigma-logsource (push) Blocked by required conditions
Sigma Rule Tests / test-sigma-legacy (push) Blocked by required conditions
Sigma Rule Tests / sigma-check (push) Blocked by required conditions
Validate Sigma rules / sigma-rules-validator (push) Waiting to run
Merge PR #5708 from @nasbench - Multiple updates and issue fixes
fix: Turla Group Commands May 2020 - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Potential Dtrack RAT Activity - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Potential Data Exfiltration Activity Via CommandLine Tools - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Suspicious Network Command - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Suspicious SYSTEM User Process Creation - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Potential Snatch Ransomware Activity - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Potential Devil Bait Malware Reconnaissance - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Mint Sandstorm - AsperaFaspex Suspicious Process Execution - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
fix: Mint Sandstorm - ManageEngine Suspicious Process Execution - Change the commandline to regex to account for additional spaces when ingesting non XML version of logs from the eventlog.
update: Powershell Token Obfuscation - Powershell - Move to the TH folder in order to set the right FP expectations.
fix: Kerberoasting Activity - Initial Query - Fix issue with filter names and logic
chore: add sorting to the rule archiver script


---------

Thanks: KingKDot
Thanks: zambomarcell
Thanks: Koifman
Co-authored-by: phantinuss <79651203+phantinuss@users.noreply.github.com>
2025-10-29 11:45:19 +01:00

179 lines
5.5 KiB
Python

# Author:
# Martin Spielmann / KION Group IT
# Nasreddine Bencherchali / Nextron Systems
__version__ = "0.0.1"
import time
import requests
import yaml
import os
from datetime import datetime
from typing import Generator
WEB_ARCHIVE_SAVE_URL = "https://web.archive.org/save/"
WEB_ARCHIVE_GET_URL = "https://web.archive.org/web/"
with open("tests/rule-references.txt", "r") as f:
RULE_REFERENCES = [i.strip() for i in f.readlines()]
path_to_rules = [
"rules",
"rules-emerging-threats",
"rules-placeholder",
"rules-threat-hunting",
"rules-compliance",
]
# Helper functions
def yield_next_rule_file_path(path_to_rules: list) -> Generator[str, None, None]:
for path_ in path_to_rules:
for root, _, files in os.walk(path_):
for file in files:
if file.endswith(".yml"):
yield os.path.join(root, file)
def get_rule_part(file_path: str, part_name: str):
yaml_dicts = get_rule_yaml(file_path)
for yaml_part in yaml_dicts:
if part_name in yaml_part.keys():
return yaml_part[part_name]
return None
def get_rule_yaml(file_path: str) -> dict:
data = []
with open(file_path, encoding="utf-8") as f:
yaml_parts = yaml.safe_load_all(f)
for part in yaml_parts:
data.append(part)
return data
def get_references(path_to_rules):
ref_list = []
for file in yield_next_rule_file_path(path_to_rules):
references = get_rule_part(file_path=file, part_name="references")
if references:
for ref in references:
# To avoid references using "Internal Research" or similar
if ref.startswith("http"):
ref_list.append(ref)
return ref_list
def archive_references(ref_list):
error_archiving = []
already_archived = []
newly_archived_references = []
for ref in ref_list:
try:
archive_response = requests.get(url=WEB_ARCHIVE_GET_URL + ref)
# If the URL is not yet archived, the Wayback Machine returns a 404 response
status_code = archive_response.status_code
if status_code in (200, 301, 302):
# Already archived
already_archived.append(ref)
print("Reference '{}' is already archived".format(ref))
elif status_code == 403:
# Wayback machine does not have permission to access the reference.
error_archiving.append(ref)
print(
"Wayback Machine got permission denied in the past, when trying to access reference '{}'. Not archiving.".format(
ref
)
)
else:
print("Reference '{}' is not archived. Archiving...".format(ref))
archive_response = requests.post(url=WEB_ARCHIVE_SAVE_URL + ref)
newly_archived_references.append(ref)
# We sleep so we don't spam the Wayback Machine too much :)
time.sleep(1)
except:
error_archiving.append(ref)
return already_archived, newly_archived_references, error_archiving
def sort_references(file_path: str):
"""Sort the references in the rule-references.txt file alphabetically."""
try:
with open(file_path, "r") as f:
references = [line.strip() for line in f.readlines() if line.strip()]
# Sort references alphabetically (case-insensitive)
references.sort(key=str.lower)
# Write the sorted references back to the file
with open(file_path, "w") as f:
for ref in references:
f.write(ref + "\n")
print("References sorted successfully.")
except Exception as e:
print(f"Error sorting references: {e}")
if __name__ == "__main__":
print("Archiving references ...\n")
tmp_ref_list = get_references(path_to_rules)
# We do an intersection between the full list and the list of references that are already archived
ref_list = list(set(tmp_ref_list) - set(RULE_REFERENCES))
already_archived, newly_archived_references, error_archiving = archive_references(
ref_list
)
with open("tests/rule-references.txt", "a") as f:
for ref in already_archived:
f.write(ref)
f.write("\n")
for ref in newly_archived_references:
f.write(ref)
f.write("\n")
# Sort the references alphabetically at the end
print("Sorting references...")
sort_references("tests/rule-references.txt")
# Write markdown output to open the issue
with open(".github/latest_archiver_output.md", "w") as f:
f.write(f"# Reference Archiver Results\n\n")
f.write(f"Last Execution: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write("### Archiver Script Results\n\n")
f.write("\n#### Newly Archived References\n\n")
if newly_archived_references:
for ref in newly_archived_references:
f.write(f"- {ref}\n")
else:
f.write("N/A\n")
f.write("\n#### Already Archived References\n\n")
if already_archived:
for ref in already_archived:
f.write(f"- {ref}\n")
else:
f.write("N/A\n")
f.write("\n#### Error While Archiving References\n\n")
if error_archiving:
for ref in error_archiving:
f.write(f"- {ref}\n")
else:
f.write("N/A\n")
print("\nDone.")