# mindnet/scripts/debug_edge_loss.py

import asyncio
import os
import sys
from pathlib import Path

# Path setup: put the absolute current working directory first on sys.path.
# Presumably this lets the `app.*` imports below resolve when the script is
# started from the repository root -- confirm against the project layout.
sys.path.insert(0, os.path.abspath("."))

from app.core.chunking import assemble_chunks, _extract_all_edges_from_md
from app.core.derive_edges import build_edges_for_note

# Mock for settings, in case it is needed.
# NOTE(review): this assignment runs after the `app.core` imports above, so it
# only has an effect if the settings read the environment lazily -- confirm.
os.environ["MINDNET_LLM_MODEL"] = "phi3:mini"

async def analyze_file(file_path: str):
    """Print a two-stage edge diagnosis for one Markdown note.

    The file is read as UTF-8 and its stem is used as the note id. Stage 1
    prints every candidate returned by ``_extract_all_edges_from_md`` for the
    whole text. Stage 2 chunks the text with ``assemble_chunks`` (sliding
    window, smart edge allocation switched off) and, for every chunk, runs
    ``build_edges_for_note`` on that single chunk and reports the edges whose
    ``rule_id`` is ``callout:edge`` or ``inline:rel``.

    Args:
        file_path: Path of the Markdown file to analyse.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n=== ANALYSE: {file_path} ===")

    source = Path(file_path)
    text = source.read_text(encoding="utf-8")

    # Stage 1: global candidates (what does the pre-scan see?).
    # This mimics the call the chunker itself makes.
    note_id = source.stem
    candidates = _extract_all_edges_from_md(text, note_id, "concept")
    print("\n[1] Globale Kandidaten (Pre-Scan):")
    for candidate in candidates:
        print(f"  - {candidate}")

    # Stage 2: chunking, with smart edges off for now, so that only the
    # physical integrity of the chunks is checked.
    # The profile is meant to mirror the user's setup.
    chunk_profile = {
        "strategy": "sliding_window",
        "target": 400,
        "max": 600,
        "overlap": 50,
        "enable_smart_edge_allocation": False,  # off for now
    }
    chunks = await assemble_chunks(note_id, text, "concept", config=chunk_profile)

    # Rule ids that count as explicitly authored edges in this report.
    explicit_rules = ('callout:edge', 'inline:rel')

    print("\n[2] Chunk-Struktur & Physische Kanten:")
    for index, chunk in enumerate(chunks):
        print(f"\n--- Chunk {index} (Section: {chunk.section_path}) ---")
        print(f"Snippet: {chunk.text[:50]}...")

        # What does derive_edges find in this raw chunk?
        # Build the payload dict that build_edges_for_note is called with.
        payload = {"text": chunk.text, "window": chunk.window, "chunk_id": chunk.id}
        edges = build_edges_for_note(note_id, [payload])

        explicit = []
        for edge in edges:
            if edge['rule_id'] in explicit_rules:
                explicit.append(f"{edge['kind']}:{edge.get('target_id')}")

        if not explicit:
            print("  ❌ Keine expliziten Kanten gefunden (trotz Callout im Text?)")
        else:
            print(f"  ✅ Gefundene Explizite Kanten: {explicit}")

        # Independently flag chunks whose text contains the literal callout marker.
        if "> [!edge]" in chunk.text:
            print("  ℹ️  HINWEIS: '> [!edge]' String ist im Text vorhanden!")

if __name__ == "__main__":
    # The first command-line argument, when given, selects the file to analyse;
    # otherwise the placeholder path below is used (adjust it to your
    # problematic file).
    target_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "./vault_master/Dein_Problematisches_File.md"
    )

    asyncio.run(analyze_file(target_file))