Files
hermes-agent/bulk_enqueue_vault.py

112 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Bulk enqueue all Obsidian vault notes into the ingestion queue.
Does NOT process directly — the orchestrator daemon handles them.
"""
import sys
import re
from pathlib import Path
from datetime import datetime
# Root directory of the Obsidian vault that main() scans recursively.
VAULT = Path.home() / "Documents" / "empire"
# Directory names that path_to_slug() drops from the front of a note's slug.
# NOTE(review): despite the name, nothing in this script uses this set to
# exclude files from the scan itself — confirm whether that is intended.
SKIP_DIRS = {".obsidian", ".git", ".stfolder", ".DS_Store", "image", "2025", "2024", "2023", "2022", "Bible 2025", "Daily Logs", "Clippings", "COAH", "House-Project", "Inbox", "OSCP Work", "OpenClaw"}
# Substrings that make is_markdown() reject a file name.
SKIP_PATTERNS = ["sync-conflict", ".DS_Store"]
# Put this script's own directory first on sys.path so the import below can
# resolve (presumably ingestion_orchestrator lives next to this script).
sys.path.insert(0, str(Path(__file__).parent))
from ingestion_orchestrator import IngestionOrchestrator
def slugify(text: str) -> str:
    """Turn *text* into a lowercase, dash-separated ASCII slug.

    Every run of characters other than ``a-z`` and ``0-9`` (after
    lowercasing) collapses into a single ``-``; leading and trailing dashes
    are removed. Text with no ASCII letters or digits yields ``""``.
    """
    lowered = text.lower()
    dashed = re.sub(r'[^a-z0-9]+', '-', lowered)
    return dashed.strip('-')
def path_to_slug(rel_path: Path) -> str:
    """Build a ``/``-separated slug from a vault-relative path.

    Each path component (the file name included, extension and all) is run
    through ``slugify``; components that slugify to nothing are dropped.
    Components named in ``SKIP_DIRS`` are omitted as long as no component has
    been kept yet, i.e. only at the front of the path.

    If the resulting slug is shorter than three characters, the lowercased
    file stem is returned instead.
    """
    segments = []
    for component in rel_path.parts:
        # Only leading components are eligible for the SKIP_DIRS filter.
        if not segments and component in SKIP_DIRS:
            continue
        cleaned = slugify(component)
        if cleaned:
            segments.append(cleaned)

    slug = "/".join(segments)
    if len(slug) < 3:
        # Covers the empty slug as well as very short ones.
        return rel_path.stem.lower()
    return slug
def extract_title(path: Path, text: str) -> str:
    """Pick a display title for a note.

    Sources are tried in order:

    1. a non-empty ``title:`` entry (starting at column 0) inside the leading
       ``---`` frontmatter block, with surrounding quotes removed;
    2. the first level-1 or level-2 ``#`` heading in the body that has
       actual text on the same line;
    3. the file stem with ``-`` and ``_`` replaced by spaces, title-cased.

    Args:
        path: Path of the note; only its stem is used, for the fallback.
        text: Full text of the note, including any frontmatter.

    Returns:
        The chosen title.
    """
    frontmatter = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
    if frontmatter:
        for line in frontmatter.group(1).split('\n'):
            if line.startswith('title:'):
                val = line.split(':', 1)[1].strip().strip('"').strip("'")
                if val:
                    return val

    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', text, flags=re.DOTALL)
    # Use [ \t]+ rather than \s+ after the hashes: \s also matches newlines,
    # which let an empty "#" line swallow the following line of body text as
    # the title. \S makes sure the heading has visible text of its own.
    heading = re.search(r'^#{1,2}[ \t]+(\S.*)$', body, re.MULTILINE)
    if heading:
        return heading.group(1).strip()

    return path.stem.replace('-', ' ').replace('_', ' ').title()
def is_markdown(path: Path) -> bool:
    """Report whether *path* names a markdown file that should be considered.

    The suffix must be ``.md`` or ``.markdown`` (case-insensitive) and the
    file name must not contain any substring listed in ``SKIP_PATTERNS``.
    """
    if path.suffix.lower() not in ('.md', '.markdown'):
        return False
    return not any(pat in path.name for pat in SKIP_PATTERNS)
def _yaml_double_quoted(value: str) -> str:
    """Return *value* wrapped as a YAML double-quoted scalar.

    Backslashes and double quotes are escaped so the generated frontmatter
    stays parseable when a title contains either character.
    """
    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def main():
    """Scan the vault and enqueue every non-empty markdown note.

    For each path under ``VAULT`` accepted by ``is_markdown``, any existing
    frontmatter is stripped and replaced with a generated block (title, slug,
    date, source), and the result is passed to
    ``IngestionOrchestrator.enqueue``. Notes are only queued here, not
    processed.

    Unreadable and empty files are counted as skipped, as are notes for which
    ``enqueue`` returns a falsy value. Progress is printed once per 50
    enqueued notes, followed by a final summary.
    """
    orchestrator = IngestionOrchestrator()
    enqueued = 0
    skipped = 0
    # Stamp every note from this run with the same date.
    today = datetime.now().strftime("%Y-%m-%d")

    for md_file in VAULT.rglob('*'):
        if not is_markdown(md_file):
            continue
        try:
            rel = md_file.relative_to(VAULT)
        except ValueError:
            continue

        try:
            text = md_file.read_text(encoding='utf-8', errors='ignore')
        except OSError as exc:
            # Includes directories whose name ends in .md and permission
            # problems; report instead of hiding the reason.
            print(f"[!] Could not read {rel}: {exc}", file=sys.stderr, flush=True)
            skipped += 1
            continue
        if not text.strip():
            skipped += 1
            continue

        slug = path_to_slug(rel)
        full_slug = f"obsidian/{slug}"
        title = extract_title(md_file, text)

        # Strip frontmatter from content for clean body
        body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', text, flags=re.DOTALL)

        # Add frontmatter with slug. The title is escaped so that quotes or
        # backslashes in it cannot break the YAML.
        frontmatter = (
            "---\n"
            f"title: {_yaml_double_quoted(title)}\n"
            f"slug: {full_slug}\n"
            f'date: "{today}"\n'
            "source: obsidian-vault\n"
            "---\n\n"
        )
        full_content = frontmatter + body

        job_id = orchestrator.enqueue(
            source="obsidian",
            slug=full_slug,
            title=title,
            content=full_content,
            meta={"rel_path": str(rel), "source": "bulk-import"},
        )
        if job_id:
            enqueued += 1
            # Report only when the counter actually advances; otherwise the
            # same milestone would be printed again for every duplicate that
            # follows it.
            if enqueued % 50 == 0:
                print(f"[+] Enqueued {enqueued}...", flush=True)
        else:
            skipped += 1  # duplicate

    print(f"\n✓ Done! Enqueued {enqueued} notes, {skipped} skipped (duplicates/empty).", flush=True)
    print(" Run `~/.hermes/bin/ingestctl status` to monitor processing.", flush=True)


if __name__ == "__main__":
    main()