# blue-team-tools/tests/reference-archiver.py

# Author:
#    Martin Spielmann / KION Group IT
#    Nasreddine Bencherchali / Nextron Systems

__version__ = "0.0.1"

import time
import requests
import yaml
import os
from datetime import datetime
from typing import Generator


# Wayback Machine endpoints: one to request a new snapshot of a URL, one to
# look up an existing snapshot. The target URL is appended to either prefix.
WEB_ARCHIVE_SAVE_URL = "https://web.archive.org/save/"
WEB_ARCHIVE_GET_URL = "https://web.archive.org/web/"

# Directories (relative to the working directory) that are searched for rules.
path_to_rules = [
    "rules",
    "rules-emerging-threats",
    "rules-placeholder",
    "rules-threat-hunting",
    "rules-compliance",
]

# References recorded by earlier runs, one per line, surrounding whitespace removed.
with open("tests/rule-references.txt") as f:
    RULE_REFERENCES = list(map(str.strip, f))


# Helper functions
def yield_next_rule_file_path(path_to_rules: list) -> Generator[str, None, None]:
    """Lazily yield the path of every ``.yml`` file below the given directories.

    Args:
        path_to_rules: Directories to walk recursively.

    Yields:
        The directory path joined with the file name for each file whose name
        ends with ``.yml``.
    """
    for rules_dir in path_to_rules:
        for current_dir, _subdirs, filenames in os.walk(rules_dir):
            yield from (
                os.path.join(current_dir, name)
                for name in filenames
                if name.endswith(".yml")
            )


def get_rule_part(file_path: str, part_name: str):
    """Return the value stored under ``part_name`` in a rule file.

    The file may contain several YAML documents; they are checked in order and
    the value from the first document that has the key is returned.

    Args:
        file_path: Path to the rule file.
        part_name: Top-level key to look up (for example ``"references"``).

    Returns:
        The value of the key, or ``None`` if no document contains it.
    """
    for yaml_part in get_rule_yaml(file_path):
        # An empty document loads as None and a scalar/list document has no
        # keys; skip those instead of failing with AttributeError on .keys().
        if isinstance(yaml_part, dict) and part_name in yaml_part:
            return yaml_part[part_name]

    return None


def get_rule_yaml(file_path: str) -> list:
    """Load every YAML document contained in a rule file.

    Args:
        file_path: Path to the rule file; it is read as UTF-8.

    Returns:
        A list with one entry per YAML document in the file, in file order.
    """
    with open(file_path, encoding="utf-8") as f:
        # safe_load_all returns a lazy iterator, so it has to be consumed
        # while the file is still open.
        return list(yaml.safe_load_all(f))


def get_references(path_to_rules):
    """Collect the URL references of all rule files below ``path_to_rules``.

    Args:
        path_to_rules: Directories that are searched for rule files.

    Returns:
        A list of every ``references`` entry that starts with ``"http"``.
        Duplicates are kept.
    """
    collected = []

    for rule_file in yield_next_rule_file_path(path_to_rules):
        rule_refs = get_rule_part(file_path=rule_file, part_name="references")
        if not rule_refs:
            continue
        # Keep URLs only; this drops entries such as "Internal Research".
        collected.extend(entry for entry in rule_refs if entry.startswith("http"))

    return collected


def archive_references(ref_list, timeout=120):
    """Check each reference against the Wayback Machine and archive missing ones.

    For every URL the Wayback Machine lookup URL is requested first. Depending
    on the status code the reference is reported as already archived, skipped
    (403), or submitted to the save endpoint. A failure on one reference never
    aborts the loop; the reference is reported in the error list instead.

    Args:
        ref_list: Iterable of reference URLs (strings) to process.
        timeout: Seconds to wait for each HTTP request, so that a stalled
            connection cannot hang the whole run.

    Returns:
        A tuple ``(already_archived, newly_archived_references,
        error_archiving)`` of three lists of reference URLs.
    """
    error_archiving = []
    already_archived = []
    newly_archived_references = []

    for ref in ref_list:
        try:
            archive_response = requests.get(
                url=WEB_ARCHIVE_GET_URL + ref, timeout=timeout
            )
            # If the URL is not yet archived, the Wayback Machine returns a 404 response
            status_code = archive_response.status_code
            if status_code in (200, 301, 302):
                # Already archived
                already_archived.append(ref)
                print(f"Reference '{ref}' is already archived")
            elif status_code == 403:
                # Wayback machine does not have permission to access the reference.
                error_archiving.append(ref)
                print(
                    f"Wayback Machine got permission denied in the past, when trying to access reference '{ref}'. Not archiving."
                )
            else:
                print(f"Reference '{ref}' is not archived. Archiving...")
                save_response = requests.post(
                    url=WEB_ARCHIVE_SAVE_URL + ref, timeout=timeout
                )
                # Only count the reference as archived when the save request
                # was accepted; otherwise it would be recorded as done and
                # never retried.
                if save_response.ok:
                    newly_archived_references.append(ref)
                else:
                    error_archiving.append(ref)
                    print(
                        f"Archiving reference '{ref}' failed with HTTP status {save_response.status_code}."
                    )
        except Exception as exc:
            # Best effort per reference: report the failure and continue.
            # Unlike a bare "except:", this lets Ctrl+C / SystemExit through.
            error_archiving.append(ref)
            print(f"Error while archiving reference '{ref}': {exc}")

        # We sleep so we don't spam the Wayback Machine too much :)
        # This also applies after a failed request.
        time.sleep(1)

    return already_archived, newly_archived_references, error_archiving


def sort_references(file_path: str):
    """Rewrite the given references file with its lines in alphabetical order.

    Lines are stripped of surrounding whitespace, blank lines are dropped and
    the rest is ordered case-insensitively. Any error is printed, not raised.

    Args:
        file_path: Path of the references file to sort in place.
    """
    try:
        with open(file_path, "r") as handle:
            stripped = (raw.strip() for raw in handle)
            entries = [entry for entry in stripped if entry]

        # Case-insensitive alphabetical order
        ordered = sorted(entries, key=str.lower)

        # Overwrite the file with one reference per line
        with open(file_path, "w") as handle:
            handle.writelines(entry + "\n" for entry in ordered)

        print("References sorted successfully.")

    except Exception as err:
        print(f"Error sorting references: {err}")


if __name__ == "__main__":
    print("Archiving references ...\n")

    # Only references that are not yet recorded in rule-references.txt need
    # to be checked, so take the set difference against the recorded ones.
    all_rule_refs = get_references(path_to_rules)
    ref_list = list(set(all_rule_refs) - set(RULE_REFERENCES))

    already_archived, newly_archived_references, error_archiving = archive_references(
        ref_list
    )

    # Record every reference that is archived now (old or new snapshot)
    with open("tests/rule-references.txt", "a") as f:
        for archived_group in (already_archived, newly_archived_references):
            f.writelines(ref + "\n" for ref in archived_group)

    # Sort the references alphabetically at the end
    print("Sorting references...")
    sort_references("tests/rule-references.txt")

    # Heading and content of each result section, in report order
    report_sections = (
        ("\n#### Newly Archived References\n\n", newly_archived_references),
        ("\n#### Already Archived References\n\n", already_archived),
        ("\n#### Error While Archiving References\n\n", error_archiving),
    )

    # Write markdown output to open the issue
    with open(".github/latest_archiver_output.md", "w") as f:
        f.write("# Reference Archiver Results\n\n")
        f.write(f"Last Execution: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("### Archiver Script Results\n\n")
        for heading, section_refs in report_sections:
            f.write(heading)
            if not section_refs:
                f.write("N/A\n")
                continue
            for ref in section_refs:
                f.write(f"- {ref}\n")

    print("\nDone.")