mindnet/scripts/debug_edge_loss.py

69 lines
2.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import os
import sys
from pathlib import Path
# Path setup: put the current working directory at the front of sys.path so
# the `app.*` imports below can be resolved (presumably the script is meant to
# be started from the repository root — confirm).
sys.path.insert(0, os.path.abspath("."))
from app.core.chunking import assemble_chunks, _extract_all_edges_from_md
from app.core.derive_edges import build_edges_for_note
# Mock value for the settings, in case it is needed.
# NOTE(review): this is assigned after the `app` imports above; if the settings
# are read at import time the value would arrive too late — confirm.
os.environ["MINDNET_LLM_MODEL"] = "phi3:mini"
async def analyze_file(file_path: str) -> None:
    """Print a diagnostic report showing where the edges of one note get lost.

    The report has two parts:

    1. The edge candidates that ``_extract_all_edges_from_md`` returns for the
       complete note text (the "pre-scan").
    2. For every chunk returned by ``assemble_chunks`` (with smart edge
       allocation switched off), the explicit edges that
       ``build_edges_for_note`` derives from that single chunk.

    Args:
        file_path: Path to the Markdown note to analyse. The file name without
            its extension is used as the note id.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    print(f"\n=== ANALYSE: {file_path} ===")

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # 1. Global candidates: what does the pre-scan see?
    #    This simulates the call the chunker makes.
    note_id = Path(file_path).stem
    candidates = _extract_all_edges_from_md(text, note_id, "concept")
    print("\n[1] Globale Kandidaten (Pre-Scan):")
    for candidate in candidates:
        print(f" - {candidate}")

    # 2. Chunking, without smart edges for now, to check the physical
    #    integrity of the chunks first. The profile is meant to mirror the
    #    user's setup.
    config = {
        "strategy": "sliding_window",
        "target": 400,
        "max": 600,
        "overlap": 50,
        "enable_smart_edge_allocation": False,  # switched off for now
    }
    chunks = await assemble_chunks(note_id, text, "concept", config=config)

    # Rule ids that mark an edge as explicitly written in the note text.
    explicit_rule_ids = ("callout:edge", "inline:rel")

    print("\n[2] Chunk-Struktur & Physische Kanten:")
    for i, chunk in enumerate(chunks):
        print(f"\n--- Chunk {i} (Section: {chunk.section_path}) ---")
        print(f"Snippet: {chunk.text[:50]}...")

        # What does derive_edges find in this raw chunk? Build the payload
        # dict that build_edges_for_note is given for a chunk.
        chunk_pl = {"text": chunk.text, "window": chunk.window, "chunk_id": chunk.id}
        edges = build_edges_for_note(note_id, [chunk_pl])

        # Use .get() throughout so that one edge dict lacking a key cannot
        # abort the whole report with a KeyError.
        found_explicitly = [
            f"{e.get('kind')}:{e.get('target_id')}"
            for e in edges
            if e.get("rule_id") in explicit_rule_ids
        ]

        if found_explicitly:
            print(f" ✅ Gefundene Explizite Kanten: {found_explicitly}")
        else:
            print(" ❌ Keine expliziten Kanten gefunden (trotz Callout im Text?)")
            # Nothing was extracted: point out chunks whose raw text still
            # contains the callout marker.
            if "> [!edge]" in chunk.text:
                print(" HINWEIS: '> [!edge]' String ist im Text vorhanden!")
if __name__ == "__main__":
    # Default note to inspect; adjust this path to the file that shows the
    # problem. A path given as the first command-line argument takes precedence.
    cli_args = sys.argv[1:]
    target_file = cli_args[0] if cli_args else "./vault_master/Dein_Problematisches_File.md"
    asyncio.run(analyze_file(target_file))