# mindnet/scripts/edges_dryrun.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/edges_dryrun.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Zweck:
-------
Simuliert die Edge-Erzeugung aus Markdown-Dateien ohne Datenbank-Upsert.
Nützlich zur Analyse, welche Kanten aus einem Vault generiert würden.

Funktionsweise:
---------------
1. Scannt alle .md-Dateien im Vault
2. Für jede Datei:
   - Parst Markdown mit Frontmatter
   - Erstellt einfache Absatz-basierte Chunks (vereinfachtes Chunking)
   - Extrahiert Edge-Kandidaten via build_edges_for_note():
     * Wikilinks [[...]]
     * Typed Relations [[rel:KIND|Target]]
     * Callout Relations [!edge] KIND: [[Target]]
     * Struktur-Kanten (belongs_to, next, prev)
   - Zählt Edges nach Typ (kind/relation)
3. Gibt JSON-Report aus mit Edge-Statistiken pro Note

Ergebnis-Interpretation:
------------------------
- Ausgabe: JSON-Array mit einem Objekt pro Note
- Jedes Objekt enthält:
  * path: Dateipfad
  * note_id: Note-Identifier
  * type: Note-Typ
  * chunks: Anzahl der Chunks
  * edges_total: Gesamtanzahl der Edges
  * edges_by_kind: Dictionary mit Edge-Anzahl pro Typ
  * samples: Erste 3 Edges als Beispiele

Verwendung:
-----------
- Analyse der Graph-Struktur vor dem Import
- Debugging von fehlenden oder unerwarteten Edges
- Validierung der Edge-Extraktions-Logik

Hinweise:
---------
- Verwendet vereinfachtes Chunking (Absatz-basiert), nicht das vollständige Chunking-System
- Edge-Extraktion entspricht der Produktions-Logik (app.core.derive_edges)
- Keine Datenbank-Operationen, rein analytisch

Aufruf:
-------
python3 -m scripts.edges_dryrun --vault ./vault
python3 -m scripts.edges_dryrun --vault ./vault --include-note-scope-refs

Parameter:
----------
--vault PATH              Pfad zum Vault-Verzeichnis (erforderlich)
--include-note-scope-refs  Berücksichtigt auch Frontmatter-Links (links[].target_id)

Änderungen:
-----------
v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung
  - Geändert: app.core.edges → app.core.derive_edges
  - Parameter korrigiert: chunk_payloads → chunks, note_level_refs → note_level_references
v1.0.0: Erster Release
"""
from __future__ import annotations

import argparse
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional

from app.core.parser import read_markdown, normalize_frontmatter
from app.core.derive_edges import build_edges_for_note

def _iter_markdown(vault: str):
    """Yield the path of every ``*.md`` file found recursively below *vault*.

    Files whose own name starts with a dot are skipped. Only the file name
    is inspected, so Markdown files located inside dot-prefixed directories
    are still yielded.

    Args:
        vault: Root directory to search.

    Yields:
        ``pathlib.Path`` objects, in the order ``Path.rglob`` produces them.
    """
    candidates = Path(vault).rglob("*.md")
    yield from (md for md in candidates if not md.name.startswith("."))

def _simple_chunker(body: str, note_id: str, note_type: str) -> List[Dict]:
    """Split *body* into paragraph chunks, the minimum needed for edge derivation.

    A paragraph boundary is one or more blank lines. A line that contains only
    whitespace counts as blank, and CRLF line endings are handled as well, so
    ``"a\\n \\nb"`` and ``"a\\r\\n\\r\\nb"`` each produce two chunks.

    Args:
        body: Markdown body text; ``None`` or an empty string yields no chunks.
        note_id: Identifier of the owning note; used as prefix of each chunk id.
        note_type: Note type copied verbatim into every chunk.

    Returns:
        One dict per non-empty paragraph, in document order, with the keys
        ``chunk_id`` (``"<note_id>#cNNNN"``), ``ord`` (0-based index),
        ``text`` (stripped paragraph), ``note_id`` and ``type``.
    """
    # `\n\s*\n` matches everything the stricter `\n{2,}` would, and additionally
    # treats whitespace-only lines and "\r\n\r\n" sequences as paragraph breaks.
    stripped_parts = (part.strip() for part in re.split(r"\n\s*\n", body or ""))
    paragraphs = (part for part in stripped_parts if part)
    return [
        {
            "chunk_id": f"{note_id}#c{index:04d}",
            "ord": index,
            "text": text,
            "note_id": note_id,
            "type": note_type,
        }
        for index, text in enumerate(paragraphs)
    ]

def _fm_note_refs(fm: Dict) -> List[str]:
    """Collect the note-level reference targets declared in the frontmatter.

    Reads ``fm["links"]`` and, when it is a list, takes the ``target_id`` of
    every dict entry. Only non-blank string values are kept; they are returned
    with surrounding whitespace removed, in their original order.

    Args:
        fm: Frontmatter mapping of a note.

    Returns:
        The stripped target ids; an empty list when ``links`` is missing,
        falsy, or not a list.
    """
    entries = fm.get("links") or []
    if not isinstance(entries, list):
        return []
    raw_targets = (
        entry.get("target_id") for entry in entries if isinstance(entry, dict)
    )
    return [
        target.strip()
        for target in raw_targets
        if isinstance(target, str) and target.strip()
    ]

def main():
    """CLI entry point: print a per-note edge report as JSON, without any upsert.

    Command-line options:
        --vault PATH               Vault directory to scan (required).
        --include-note-scope-refs  Forwarded unchanged to
                                   ``build_edges_for_note`` as
                                   ``include_note_scope_refs``.

    For every Markdown file that ``read_markdown`` returns a truthy result for,
    the note is chunked by paragraph, edges are derived, and one report entry
    is collected. The full report is written to stdout as a JSON array.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True)
    parser.add_argument("--include-note-scope-refs", action="store_true")
    opts = parser.parse_args()

    results = []
    for md_path in _iter_markdown(opts.vault):
        doc = read_markdown(str(md_path))
        if not doc:
            # Nothing was parsed for this file; leave it out of the report.
            continue

        frontmatter = normalize_frontmatter(doc.frontmatter or {})
        # Fall back to the file stem / "concept" when the frontmatter is silent.
        note_id = frontmatter.get("id") or md_path.stem
        note_type = (frontmatter.get("type") or "concept").lower()
        chunks = _simple_chunker(doc.body, note_id, note_type)

        edges = build_edges_for_note(
            note_id=note_id,
            chunks=chunks,
            note_level_references=_fm_note_refs(frontmatter),
            include_note_scope_refs=opts.include_note_scope_refs,
        )

        # Tally edges per label: "relation" wins over "kind", else "edge".
        per_kind = {}
        for edge in edges:
            label = edge.get("relation") or edge.get("kind") or "edge"
            per_kind.setdefault(label, 0)
            per_kind[label] += 1

        entry = {
            "path": str(md_path),
            "note_id": note_id,
            "type": note_type,
            "chunks": len(chunks),
            "edges_total": len(edges),
            "edges_by_kind": per_kind,
            "samples": edges[:3],  # first three edges serve as examples
        }
        results.append(entry)

    print(json.dumps(results, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()