# mindnet/scripts/import_markdown.py

#!/usr/bin/env python3
"""
scripts/import_markdown.py
CLI-Tool zum Importieren von Markdown-Dateien in Qdrant.
WP-15b: Implementiert den Two-Pass Workflow (Pre-Scan + Processing).
Sorgt dafür, dass der LocalBatchCache vor der Verarbeitung robust gefüllt wird.
Indiziert Notizen nach ID, Titel und Dateiname für maximale Link-Kompatibilität.
VERSION: 2.4.1
"""
import asyncio
import os
import argparse
import logging
from pathlib import Path
from dotenv import load_dotenv

# Configure root logging at INFO so progress messages are visible in the log.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

# Put the current working directory on the import path before loading the
# project's `app` package (async ingestion service and Markdown pre-scanner).
# NOTE(review): this presumably requires starting the script from the project
# root so that `app` is importable — confirm.
import sys
sys.path.append(os.getcwd())

from app.core.ingestion import IngestionService
from app.core.parser import pre_scan_markdown

# Logger shared by all functions in this script.
logger = logging.getLogger("importer")

def _collect_markdown_files(vault_path):
    """Return the sorted list of ``*.md`` paths found below *vault_path*.

    Anything located inside a ``.obsidian`` directory is skipped. The check
    compares whole path components relative to the vault root, so a vault that
    itself lives below a path containing ".obsidian", or a note whose file
    name merely contains that text, is not dropped by accident.
    """
    return sorted(
        md_file
        for md_file in vault_path.rglob("*.md")
        if ".obsidian" not in md_file.relative_to(vault_path).parts
    )


def _prefill_batch_cache(service, files):
    """Pre-scan *files* and register every resulting context in ``service.batch_cache``.

    Each context is stored under three keys: its ``note_id``, its ``title``
    and the file name without extension. A file whose pre-scan raises is
    logged as a warning and skipped (best effort).

    Returns:
        The number of files for which a context was stored.
    """
    cached = 0
    for f_path in files:
        try:
            ctx = pre_scan_markdown(str(f_path))
            if not ctx:
                continue

            # 1. Look-up via note ID
            service.batch_cache[ctx.note_id] = ctx
            # 2. Look-up via title (for wikilinks of the form [[Title]])
            service.batch_cache[ctx.title] = ctx
            # 3. Look-up via file name (for wikilinks of the form [[Filename]])
            service.batch_cache[f_path.stem] = ctx
            cached += 1
        except Exception as e:
            logger.warning(f"⚠️ Could not pre-scan {f_path.name}: {e}")
    return cached


async def main_async(args):
    """Run the two-pass Markdown import for the vault given in *args*.

    Pass 1 pre-scans every Markdown file and fills ``service.batch_cache``.
    Pass 2 calls ``service.process_file`` for every file, in batches of 20
    with at most 5 files in flight at once, and tallies the returned statuses.

    Args:
        args: Parsed CLI namespace. Reads ``vault`` (vault root path),
            ``prefix`` (collection prefix), ``force`` and ``apply``.

    Returns:
        None. If the vault path does not exist, an error is logged and the
        function returns without doing any work.
    """
    vault_path = Path(args.vault).resolve()
    if not vault_path.exists():
        logger.error(f"Vault path does not exist: {vault_path}")
        return

    # Initialise the service
    logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
    service = IngestionService(collection_prefix=args.prefix)

    logger.info(f"Scanning {vault_path}...")
    files = _collect_markdown_files(vault_path)
    logger.info(f"Found {len(files)} markdown files.")

    # =========================================================================
    # PASS 1: global pre-scan
    # Fills the batch cache that pass 2 relies on; notes are indexed by ID,
    # title and file name.
    # =========================================================================
    logger.info(f"🔍 [Pass 1] Pre-scanning {len(files)} files for global context cache...")
    cached = _prefill_batch_cache(service, files)
    # Report what was actually cached, not how many files were found.
    logger.info(f"✅ Context Cache populated for {cached} notes.")

    # =========================================================================
    # PASS 2: processing
    # Runs with the cache from pass 1 already populated.
    # =========================================================================
    stats = {"processed": 0, "skipped": 0, "errors": 0}
    # At most 5 files are processed concurrently (original note: for cloud stability).
    sem = asyncio.Semaphore(5)

    async def process_with_limit(f_path):
        """Process one file under the semaphore; turn any exception into an error result."""
        async with sem:
            try:
                return await service.process_file(
                    file_path=str(f_path),
                    vault_root=str(vault_path),
                    force_replace=args.force,
                    apply=args.apply,
                    purge_before=True,
                )
            except Exception as e:
                # Keep the batch going: report the failure as a result dict.
                return {"status": "error", "error": str(e), "path": str(f_path)}

    logger.info("🚀 [Pass 2] Starting semantic processing in batches...")

    batch_size = 20
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        logger.info(f"Processing batch {i} to {i+len(batch)}...")

        results = await asyncio.gather(*(process_with_limit(f) for f in batch))

        for res in results:
            status = res.get("status")
            if status == "success":
                stats["processed"] += 1
            elif status == "error":
                stats["errors"] += 1
                logger.error(f"Error in {res.get('path')}: {res.get('error')}")
            else:
                # Any other status counts as skipped.
                stats["skipped"] += 1

    logger.info(f"Done. Stats: {stats}")
    if not args.apply:
        logger.info("DRY RUN. Use --apply to write to DB.")

def main():
    """Parse the command-line options and run the asynchronous import.

    Environment variables are loaded via ``load_dotenv`` first, so that
    ``COLLECTION_PREFIX`` can supply the default for ``--prefix``.
    """
    load_dotenv()

    cli = argparse.ArgumentParser(description="Two-Pass Markdown Ingestion for Mindnet")
    cli.add_argument("--vault", default="./vault", help="Path to vault root")
    cli.add_argument(
        "--prefix",
        default=os.getenv("COLLECTION_PREFIX", "mindnet"),
        help="Collection prefix",
    )
    # Boolean switches, registered in the same order as they appear in --help.
    for flag, description in (
        ("--force", "Force re-index all files"),
        ("--apply", "Perform writes to Qdrant"),
    ):
        cli.add_argument(flag, action="store_true", help=description)

    options = cli.parse_args()

    # Drive the async workflow to completion on a fresh event loop.
    asyncio.run(main_async(options))

# Run the importer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()