#!/usr/bin/env python3
"""
Bulk enqueue all Obsidian vault notes into the ingestion queue.

Does NOT process directly — the orchestrator daemon handles them.
"""
|
|
import re
import sys
from datetime import datetime
from pathlib import Path
|
|
|
|
VAULT = Path.home() / "Documents" / "empire"
|
|
SKIP_DIRS = {".obsidian", ".git", ".stfolder", ".DS_Store", "image", "2025", "2024", "2023", "2022", "Bible 2025", "Daily Logs", "Clippings", "COAH", "House-Project", "Inbox", "OSCP Work", "OpenClaw"}
|
|
SKIP_PATTERNS = ["sync-conflict", ".DS_Store"]
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
from ingestion_orchestrator import IngestionOrchestrator
|
|
|
|
def slugify(text: str) -> str:
    """Lower-case *text*, collapse each non-alphanumeric run into a single
    hyphen, and trim hyphens from both ends."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
|
|
|
|
def path_to_slug(rel_path: Path) -> str:
    """Build a URL-style slug from a vault-relative path.

    Leading components that appear in SKIP_DIRS are dropped; every
    remaining component is slugified and the survivors joined with "/".
    Falls back to the lower-cased filename stem when the result is empty
    or implausibly short.
    """
    pieces = []
    for component in rel_path.parts:
        # Drop skip-dirs only while nothing has been collected yet, so
        # leading organizational folders vanish but nested ones survive.
        if not pieces and component in SKIP_DIRS:
            continue
        cleaned = slugify(component)
        if cleaned:
            pieces.append(cleaned)
    # "/".join handles the empty and single-element cases naturally.
    slug = "/".join(pieces)
    if not slug or len(slug) < 3:
        slug = rel_path.stem.lower()
    return slug
|
|
|
|
def extract_title(path: Path, text: str) -> str:
    """Derive a human-readable title for a note.

    Priority: the `title:` field of a YAML frontmatter block, then the
    first H1/H2 heading in the body, then a title-cased version of the
    filename stem.
    """
    fm = re.match(r'^---\s*\n(.*?)\n---', text, re.DOTALL)
    if fm:
        for fm_line in fm.group(1).split('\n'):
            if not fm_line.startswith('title:'):
                continue
            value = fm_line.split(':', 1)[1].strip().strip('"').strip("'")
            if value:
                return value
    # No usable frontmatter title; look for the first markdown heading.
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', text, flags=re.DOTALL)
    heading = re.search(r'^(#{1,2})\s+(.+)$', body, re.MULTILINE)
    if heading:
        return heading.group(2).strip()
    # Last resort: prettify the filename itself.
    return path.stem.replace('-', ' ').replace('_', ' ').title()
|
|
|
|
def is_markdown(path: Path) -> bool:
    """Return True when *path* is a markdown note worth ingesting.

    Rejects anything without a .md/.markdown suffix, and any filename
    containing one of SKIP_PATTERNS (sync conflicts, macOS metadata).
    """
    if path.suffix.lower() not in ('.md', '.markdown'):
        return False
    return not any(pattern in path.name for pattern in SKIP_PATTERNS)
|
|
|
|
def main():
    """Walk the vault and enqueue every markdown note for ingestion.

    Each note is wrapped in fresh YAML frontmatter (title, slug, date,
    source) before being handed to the orchestrator. Unreadable, empty,
    and duplicate notes are counted as skipped.
    """
    orchestrator = IngestionOrchestrator()
    enqueued = 0
    skipped = 0

    # Hoist loop invariants: the frontmatter-stripping regex and today's date.
    frontmatter_re = re.compile(r'^---\s*\n.*?\n---\s*\n', flags=re.DOTALL)
    today = datetime.now().strftime("%Y-%m-%d")

    for md_file in VAULT.rglob('*'):
        if not is_markdown(md_file):
            continue

        try:
            rel = md_file.relative_to(VAULT)
        except ValueError:
            # Shouldn't happen for rglob results, but stay defensive.
            continue

        try:
            text = md_file.read_text(encoding='utf-8', errors='ignore')
        except Exception:
            # Best-effort bulk import: unreadable files are counted, not fatal.
            skipped += 1
            continue

        if not text.strip():
            skipped += 1
            continue

        slug = path_to_slug(rel)
        full_slug = f"obsidian/{slug}"
        title = extract_title(md_file, text)

        # Strip any existing frontmatter so the note carries exactly one block.
        body = frontmatter_re.sub('', text)
        # Escape double quotes so a title containing `"` cannot break the
        # quoted YAML scalar we generate below.
        safe_title = title.replace('"', '\\"')
        frontmatter = (
            f'---\n'
            f'title: "{safe_title}"\n'
            f'slug: {full_slug}\n'
            f'date: "{today}"\n'
            f'source: obsidian-vault\n'
            f'---\n\n'
        )
        full_content = frontmatter + body

        job_id = orchestrator.enqueue(
            source="obsidian",
            slug=full_slug,
            title=title,
            content=full_content,
            meta={"rel_path": str(rel), "source": "bulk-import"}
        )

        if job_id:
            enqueued += 1
            # Progress line exactly once per 50 enqueued notes. (The old
            # check ran once per *file*, so it re-printed the same count
            # on every skipped file while enqueued sat at a multiple of 50.)
            if enqueued % 50 == 0:
                print(f"[+] Enqueued {enqueued}...", flush=True)
        else:
            skipped += 1  # duplicate — orchestrator declined the job

    print(f"\n✓ Done! Enqueued {enqueued} notes, {skipped} skipped (duplicates/empty).", flush=True)
    print(" Run `~/.hermes/bin/ingestctl status` to monitor processing.", flush=True)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|