#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/import_markdown.py
VERSION: 2.6.0 (2026-01-10)
STATUS: Active (Core)
COMPATIBILITY: IngestionProcessor v3.3.7+
Zweck: Hauptwerkzeug zum Importieren von Markdown-Dateien.
Implementiert die globale 2-Phasen-Schreibstrategie.
"""
import asyncio
import os
import argparse
import logging
import sys
from pathlib import Path
from dotenv import load_dotenv
# Root logger setup — configured BEFORE the app.core imports below so that
# anything those modules log at import time already uses this format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
# Make the repo root importable when the script is run from the project
# directory (allows `python scripts/import_markdown.py` without installing).
sys.path.append(os.getcwd())
from app.core.ingestion import IngestionService
from app.core.parser import pre_scan_markdown
logger = logging.getLogger("importer")
# Folders whose files are never imported (trash, app metadata, templates, ...).
IGNORE_FOLDERS = (".trash", ".obsidian", ".sync", "templates", "_system")


def _filter_vault_files(all_files):
    """Return the sorted subset of *all_files* that should be imported.

    A file is dropped when any ignored folder name occurs in its path, or
    when the path contains a hidden ('/.') component. Sorting makes the
    processing order deterministic across runs.
    """
    kept = []
    for f in all_files:
        f_str = str(f)
        if any(folder in f_str for folder in IGNORE_FOLDERS):
            continue
        # "/." catches hidden directories and hidden files (POSIX-style paths).
        if "/." in f_str:
            continue
        kept.append(f)
    kept.sort()
    return kept


async def main_async(args):
    """Run the full vault import: pre-scan, batch import, symmetry injection.

    Args:
        args: argparse.Namespace with ``vault`` (path), ``prefix`` (str),
              ``force`` (bool) and ``apply`` (bool) attributes.

    Returns None; progress and final stats are reported via the logger.
    """
    vault_path = Path(args.vault).resolve()
    if not vault_path.exists():
        logger.error(f"Vault path does not exist: {vault_path}")
        return
    logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
    service = IngestionService(collection_prefix=args.prefix)
    logger.info(f"Scanning {vault_path}...")
    # --- GLOBAL FOLDER FILTER ---
    files = _filter_vault_files(vault_path.rglob("*.md"))
    logger.info(f"Found {len(files)} relevant markdown files.")
    # =========================================================================
    # PASS 1: Global pre-scan — seed the cross-file context cache so that
    # links can resolve to notes that have not been imported yet.
    # =========================================================================
    logger.info("🔍 [Pass 1] Pre-scanning files for global context cache...")
    for f_path in files:
        try:
            ctx = pre_scan_markdown(str(f_path))
            if ctx:
                # Index the context under note id, title AND bare filename so
                # a link written in any of the three forms resolves.
                service.batch_cache[ctx.note_id] = ctx
                service.batch_cache[ctx.title] = ctx
                fname = os.path.splitext(f_path.name)[0]
                service.batch_cache[fname] = ctx
        except Exception as e:
            # Best effort: one broken file must not abort the pre-scan,
            # but log it instead of silently swallowing the error.
            logger.warning(f"Pre-scan failed for {f_path}: {e}")
    # =========================================================================
    # PHASE 1: Batch import (notes & explicit edges), bounded concurrency.
    # =========================================================================
    stats = {"processed": 0, "skipped": 0, "errors": 0}
    sem = asyncio.Semaphore(5)  # cap concurrent file imports

    async def process_with_limit(f_path):
        """Import one file under the semaphore; map exceptions to error dicts."""
        async with sem:
            try:
                # Uses process_file (v3.3.7)
                return await service.process_file(
                    file_path=str(f_path), vault_root=str(vault_path),
                    force_replace=args.force, apply=args.apply, purge_before=True
                )
            except Exception as e:
                return {"status": "error", "error": str(e), "path": str(f_path)}

    batch_size = 20
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        logger.info(f"--- Processing Batch {i // batch_size + 1} ---")
        results = await asyncio.gather(*(process_with_limit(f) for f in batch))
        for res in results:
            if res.get("status") == "success":
                stats["processed"] += 1
            elif res.get("status") == "error":
                stats["errors"] += 1
            else:
                stats["skipped"] += 1
    # =========================================================================
    # PHASE 2: Global symmetry injection (only after ALL batches finished,
    # and only when changes are actually being applied).
    # =========================================================================
    if args.apply:
        logger.info("🔄 [Phase 2] Starting global symmetry injection for the entire vault...")
        sym_res = await service.commit_vault_symmetries()
        if sym_res.get("status") == "success":
            logger.info(f"✅ Finished global symmetry injection. Added: {sym_res.get('added', 0)}")
    logger.info(f"Final Stats: {stats}")
def main():
    """CLI entry point: load the environment, parse arguments, run the import.

    Exits with status 1 on any fatal error so shells/CI can detect failure
    (previously the error was logged but the process still exited 0).
    """
    load_dotenv()
    default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", default="./vault",
                        help="Path to the markdown vault root.")
    parser.add_argument("--prefix", default=default_prefix,
                        help="Collection name prefix (default: $COLLECTION_PREFIX or 'mindnet').")
    parser.add_argument("--force", action="store_true",
                        help="Force-replace already imported notes.")
    parser.add_argument("--apply", action="store_true",
                        help="Actually write changes (enables Phase 2 symmetry injection).")
    args = parser.parse_args()
    try:
        asyncio.run(main_async(args))
    except Exception as e:
        # exc_info=True preserves the traceback that the plain message loses.
        logger.critical(f"FATAL ERROR: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    main()