"""
FILE: app/core/parsing/parsing_utils.py
DESCRIPTION: Utilities for frontmatter validation, normalization and
             wikilink extraction.
"""
import re
from typing import Any, Dict, List, Tuple, Optional

from .parsing_models import ParsedNote

# Public constants, kept for backward compatibility.
FRONTMATTER_RE = re.compile(r"^\s*---\s*$")
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Edge kind used when a wikilink does not name one.
_DEFAULT_EDGE_KIND = "related_to"

# String spellings that normalize to False (compared stripped and casefolded).
_FALSY_STRINGS = frozenset({"", "0", "false", "no", "off"})


def _is_blank(value: Any) -> bool:
    """Return True for ``None`` and for empty or whitespace-only strings."""
    return value is None or (isinstance(value, str) and not value.strip())


def validate_required_frontmatter(
    fm: Dict[str, Any],
    required: Tuple[str, ...] = ("id", "title"),
) -> None:
    """Check that all required frontmatter fields are present.

    Args:
        fm: Frontmatter mapping; ``None`` is treated as an empty mapping.
        required: Keys whose value must be neither ``None`` nor a blank
            string.

    Raises:
        ValueError: If a required field is missing or blank, or if ``tags``
            is set to something other than ``None``, ``""``, a list or a
            tuple.
    """
    if fm is None:
        fm = {}

    missing = [key for key in required if _is_blank(fm.get(key))]
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")

    tags = fm.get("tags")
    if tags not in (None, "") and not isinstance(tags, (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")


def _normalize_tags(tags: Any) -> List[str]:
    """Coerce a raw ``tags`` value into a list of stripped strings."""
    if tags is None:
        return []
    if isinstance(tags, str):
        stripped = tags.strip()
        return [stripped] if stripped else []
    # Tuples pass validate_required_frontmatter, so they are treated like
    # lists here instead of being stringified as one single tag.
    if isinstance(tags, (list, tuple)):
        return [str(tag).strip() for tag in tags if tag is not None]
    # Any other scalar becomes a single tag.
    return [str(tags).strip()]


def _coerce_bool(value: Any) -> bool:
    """Convert a frontmatter value to ``bool``.

    Plain ``bool()`` would turn the string ``"false"`` into ``True``, so
    strings are matched against the known falsy spellings first.
    """
    if isinstance(value, str):
        return value.strip().casefold() not in _FALSY_STRINGS
    return bool(value)


def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize tags and boolean fields of a frontmatter mapping.

    Args:
        fm: Frontmatter mapping; ``None`` is treated as an empty mapping.
            The input is not mutated.

    Returns:
        A shallow copy in which ``tags`` (if present) is a list of stripped
        strings and ``embedding_exclude`` (if present) is a ``bool``.
    """
    out = dict(fm or {})
    if "tags" in out:
        out["tags"] = _normalize_tags(out["tags"])
    if "embedding_exclude" in out:
        out["embedding_exclude"] = _coerce_bool(out["embedding_exclude"])
    return out


def _split_wikilink(raw: str) -> Tuple[str, Optional[str]]:
    """Split the inner text of ``[[...]]`` into ``(target, label)``.

    The target is the text before the first ``|`` with any ``#...`` suffix
    removed. The label is the stripped text after the first ``|``, or
    ``None`` when the link contains no ``|``.
    """
    before_pipe, pipe, after_pipe = raw.partition("|")
    target = before_pipe.partition("#")[0].strip()
    label = after_pipe.strip() if pipe else None
    return target, label


def extract_wikilinks(text: str) -> List[str]:
    """Extract wikilink targets as a flat list of IDs.

    Args:
        text: Text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        Link targets in order of appearance, with the ``|...`` and ``#...``
        parts removed. Links whose target is empty are skipped.
    """
    if not text:
        return []
    targets = (
        _split_wikilink(match.group(1))[0]
        for match in _WIKILINK_RE.finditer(text)
    )
    return [target for target in targets if target]


def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """WP-22: Extract wikilinks with line numbers for the EdgeRegistry.

    Args:
        parsed: Note whose ``body`` is scanned; a falsy note or a falsy
            body yields ``[]``.

    Returns:
        One dict per wikilink with a non-empty target, with the keys
        ``to`` (target without the ``#...`` part), ``kind`` (text after
        ``|``, or ``"related_to"`` when absent or blank), ``line``
        (1-based line number within ``parsed.body``) and ``provenance``
        (always ``"explicit"``).
    """
    edges: List[Dict[str, Any]] = []
    if not parsed or not parsed.body:
        return edges

    for line_num, line_content in enumerate(parsed.body.splitlines(), 1):
        for match in _WIKILINK_RE.finditer(line_content):
            target, label = _split_wikilink(match.group(1))
            if not target:
                continue
            edges.append({
                "to": target,
                # A blank label ("[[Target|]]") falls back to the default
                # kind rather than producing an edge with an empty kind.
                "kind": label or _DEFAULT_EDGE_KIND,
                "line": line_num,
                "provenance": "explicit",
            })
    return edges