#!/usr/bin/env python3
"""
Bulk enqueue all Obsidian vault notes into the ingestion queue.

Does NOT process directly — the orchestrator daemon handles them.
"""
|
|
import re
import sys
from datetime import datetime
from pathlib import Path
|
|
|
|
VAULT = Path.home() / "Documents" / "empire"
|
|
SKIP_DIRS = {".obsidian", ".git", ".stfolder", ".DS_Store", "image", "2025", "2024", "2023", "2022", "Bible 2025", "Daily Logs", "Clippings", "COAH", "House-Project", "Inbox", "OSCP Work", "OpenClaw"}
|
|
SKIP_PATTERNS = ["sync-conflict", ".DS_Store"]
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
from ingestion_orchestrator import IngestionOrchestrator
|
|
|
|
def slugify(text: str) -> str:
    """Lower-case *text*, collapse each non-alphanumeric run into a single
    hyphen, and trim hyphens from both ends."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
|
|
|
|
def path_to_slug(rel_path: Path) -> str:
    """Build a URL-style slug from a vault-relative path.

    Leading components that appear in SKIP_DIRS are dropped; every
    remaining component is slugified and the survivors joined with "/".
    Falls back to the lower-cased filename stem when the result is empty
    or implausibly short.
    """
    pieces = []
    for component in rel_path.parts:
        # Drop skip-dirs only while nothing has been collected yet, so
        # leading organizational folders vanish but nested ones survive.
        if not pieces and component in SKIP_DIRS:
            continue
        cleaned = slugify(component)
        if cleaned:
            pieces.append(cleaned)
    # "/".join handles the empty and single-element cases naturally.
    slug = "/".join(pieces)
    if not slug or len(slug) < 3:
        slug = rel_path.stem.lower()
    return slug
|
|
|
|
def extract_title(path: Path, text: str) -> str:
    """Derive a human-readable title for a note.

    Priority: the `title:` field of a YAML frontmatter block, then the
    first H1/H2 heading in the body, then a title-cased version of the
    filename stem.
    """
    fm = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
    if fm:
        for fm_line in fm.group(1).split('\n'):
            if not fm_line.startswith('title:'):
                continue
            value = fm_line.split(':', 1)[1].strip().strip('"').strip("'")
            if value:
                return value
    # No usable frontmatter title; look for the first markdown heading.
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', text, flags=re.DOTALL)
    heading = re.search(r'^(#{1,2})\s+(.+)$', body, re.MULTILINE)
    if heading:
        return heading.group(2).strip()
    # Last resort: prettify the filename itself.
    return path.stem.replace('-', ' ').replace('_', ' ').title()
|
|
|
|
def is_markdown(path: Path) -> bool:
    """Return True when *path* is a markdown note worth ingesting.

    Rejects anything without a .md/.markdown suffix, and any filename
    containing one of SKIP_PATTERNS (sync conflicts, macOS metadata).
    """
    if path.suffix.lower() not in ('.md', '.markdown'):
        return False
    return not any(pattern in path.name for pattern in SKIP_PATTERNS)
|
|
|
|
def main():
    """Walk the vault and enqueue every markdown note for ingestion.

    Each note is wrapped in fresh YAML frontmatter (title, slug, date,
    source) before being handed to the orchestrator. Unreadable, empty,
    and duplicate notes are counted as skipped.
    """
    orchestrator = IngestionOrchestrator()
    enqueued = 0
    skipped = 0

    # Hoist loop invariants: the frontmatter-stripping regex and today's date.
    frontmatter_re = re.compile(r'^---\s*\n.*?\n---\s*\n', flags=re.DOTALL)
    today = datetime.now().strftime("%Y-%m-%d")

    for md_file in VAULT.rglob('*'):
        if not is_markdown(md_file):
            continue

        try:
            rel = md_file.relative_to(VAULT)
        except ValueError:
            # Shouldn't happen for rglob results, but stay defensive.
            continue

        try:
            text = md_file.read_text(encoding='utf-8', errors='ignore')
        except Exception:
            # Best-effort bulk import: unreadable files are counted, not fatal.
            skipped += 1
            continue

        if not text.strip():
            skipped += 1
            continue

        slug = path_to_slug(rel)
        full_slug = f"obsidian/{slug}"
        title = extract_title(md_file, text)

        # Strip any existing frontmatter so the note carries exactly one block.
        body = frontmatter_re.sub('', text)
        # Escape double quotes so a title containing `"` cannot break the
        # quoted YAML scalar we generate below.
        safe_title = title.replace('"', '\\"')
        frontmatter = (
            f'---\n'
            f'title: "{safe_title}"\n'
            f'slug: {full_slug}\n'
            f'date: "{today}"\n'
            f'source: obsidian-vault\n'
            f'---\n\n'
        )
        full_content = frontmatter + body

        job_id = orchestrator.enqueue(
            source="obsidian",
            slug=full_slug,
            title=title,
            content=full_content,
            meta={"rel_path": str(rel), "source": "bulk-import"}
        )

        if job_id:
            enqueued += 1
            # Progress line exactly once per 50 enqueued notes. (The old
            # check ran once per *file*, so it re-printed the same count
            # on every skipped file while enqueued sat at a multiple of 50.)
            if enqueued % 50 == 0:
                print(f"[+] Enqueued {enqueued}...", flush=True)
        else:
            skipped += 1  # duplicate — orchestrator declined the job

    print(f"\n✓ Done! Enqueued {enqueued} notes, {skipped} skipped (duplicates/empty).", flush=True)
    print(" Run `~/.hermes/bin/ingestctl status` to monitor processing.", flush=True)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|