# mindnet/app/core/parsing/parsing_utils.py

"""
FILE: app/core/parsing/parsing_utils.py
DESCRIPTION: Werkzeuge zur Validierung, Normalisierung und Wikilink-Extraktion.
"""
import re
from typing import Any, Dict, List, Tuple, Optional
from .parsing_models import ParsedNote

# Public constants, kept for backward compatibility.
# Matches a string that consists only of a `---` delimiter, with optional
# surrounding whitespace.
FRONTMATTER_RE = re.compile(r"^\s*---\s*$")
# Matches a `[[...]]` wikilink; group 1 captures the non-empty text between
# the double brackets (any characters except `]`).
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

def validate_required_frontmatter(fm: Dict[str, Any], required: Tuple[str, ...] = ("id", "title")) -> None:
    """Check that every required frontmatter field is present and non-blank.

    A field counts as missing when its value is ``None`` (or the key is
    absent) or when it is a string containing only whitespace.

    Args:
        fm: Parsed frontmatter mapping; ``None`` is treated as an empty mapping.
        required: Names of the fields that must be present.

    Raises:
        ValueError: If one or more required fields are missing, or if ``tags``
            is present, is neither ``None`` nor ``""``, and is not a list or
            tuple.
    """
    fields = {} if fm is None else fm

    def _is_blank(value: Any) -> bool:
        return value is None or (isinstance(value, str) and not value.strip())

    absent = [name for name in required if _is_blank(fields.get(name))]
    if absent:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(absent)}")

    if "tags" not in fields:
        return
    tags = fields["tags"]
    # An unset / empty tags value is tolerated; anything else must be a sequence.
    if tags in (None, ""):
        return
    if not isinstance(tags, (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")

def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of ``fm`` with tags and boolean fields normalized.

    ``tags`` (only if the key is present) always becomes a list of stripped,
    non-empty strings:

    * a list or tuple is converted element-wise, skipping ``None`` and blank
      entries;
    * ``None`` becomes an empty list;
    * any other value is stringified and becomes a single-element list
      (or an empty list if the result is blank).

    ``embedding_exclude`` (only if the key is present) becomes a ``bool``.
    String values are interpreted textually, so ``"false"``, ``"no"``,
    ``"off"``, ``"0"`` and the empty string (case-insensitive, surrounding
    whitespace ignored) yield ``False``; every other value uses Python
    truthiness.

    Args:
        fm: Frontmatter mapping; ``None`` or an empty mapping yields ``{}``.

    Returns:
        A new dict; the input mapping is not modified.
    """
    out = dict(fm or {})

    if "tags" in out:
        tags = out["tags"]
        if isinstance(tags, (list, tuple)):
            # Tuples pass validate_required_frontmatter, so they must be
            # handled as sequences rather than stringified as a whole.
            stripped = [str(t).strip() for t in tags if t is not None]
            # Drop blank entries, matching the single-string case below.
            out["tags"] = [t for t in stripped if t]
        elif tags is None:
            out["tags"] = []
        else:
            single = str(tags).strip()
            out["tags"] = [single] if single else []

    if "embedding_exclude" in out:
        flag = out["embedding_exclude"]
        if isinstance(flag, str):
            # bool("false") is True, so textual flags need explicit parsing.
            out["embedding_exclude"] = flag.strip().lower() not in ("", "0", "false", "no", "off")
        else:
            out["embedding_exclude"] = bool(flag)

    return out

def extract_wikilinks(text: str) -> List[str]:
    """Return the targets of all ``[[...]]`` wikilinks in ``text``, in order.

    For each link, only the part before the first ``|`` and then before the
    first ``#`` is kept (whitespace-stripped). Links whose remaining target
    is empty are skipped. Duplicates are preserved.

    Args:
        text: Text to scan; a falsy value yields an empty list.

    Returns:
        The extracted link targets in order of appearance.
    """
    if not text:
        return []

    targets: List[str] = []
    for hit in _WIKILINK_RE.finditer(text):
        inner = (hit.group(1) or "").strip()
        # Cut at the first "|" and then at the first "#".
        before_pipe = inner.partition("|")[0].strip()
        target = before_pipe.partition("#")[0].strip()
        if target:
            targets.append(target)
    return targets

def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """WP-22: extract wikilinks with their line numbers for the EdgeRegistry.

    Each ``[[...]]`` link in ``parsed.body`` yields one edge dict with keys:

    * ``to``: the link target (text before the first ``|``, cut at the first
      ``#``, stripped);
    * ``kind``: the stripped text after the first ``|``, or ``"related_to"``
      when there is no ``|`` or the text after it is blank;
    * ``line``: 1-based line number within ``parsed.body``;
    * ``provenance``: always ``"explicit"``.

    Links whose target is empty after trimming are skipped.

    Args:
        parsed: The parsed note; a falsy note or a falsy body yields ``[]``.

    Returns:
        Edge dicts in order of appearance.
    """
    edges: List[Dict[str, Any]] = []
    if not parsed or not parsed.body:
        return edges

    for line_num, line_content in enumerate(parsed.body.splitlines(), 1):
        for match in _WIKILINK_RE.finditer(line_content):
            raw = (match.group(1) or "").strip()
            if not raw:
                continue
            target, _, label = raw.partition("|")
            # `[[target|]]` has nothing after the pipe; use the default kind
            # rather than emitting an edge with an empty kind.
            kind = label.strip() or "related_to"
            target = target.partition("#")[0].strip()
            if target:
                edges.append({"to": target, "kind": kind, "line": line_num, "provenance": "explicit"})
    return edges