mindnet/app/core/parsing/parsing_utils.py
2025-12-27 14:26:42 +01:00

69 lines
2.9 KiB
Python

"""
FILE: app/core/parsing/parsing_utils.py
DESCRIPTION: Werkzeuge zur Validierung, Normalisierung und Wikilink-Extraktion.
"""
import re
from typing import Any, Dict, List, Tuple, Optional
from .parsing_models import ParsedNote
# Public constants, kept for backward compatibility.
# Matches a line that consists solely of a `---` fence, with optional
# surrounding whitespace.
FRONTMATTER_RE = re.compile(r"^\s*---\s*$")
# Matches `[[...]]`; group 1 captures the non-empty text between the brackets
# (any characters except `]`).
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
def validate_required_frontmatter(fm: Dict[str, Any], required: Tuple[str, ...] = ("id", "title")) -> None:
    """Check that every required frontmatter field is present and non-blank.

    A field counts as missing when its value is ``None`` or a string that is
    empty after stripping whitespace. ``None`` passed as ``fm`` is treated as
    an empty mapping.

    Args:
        fm: Parsed frontmatter mapping (may be ``None``).
        required: Names of the fields that must be present.

    Raises:
        ValueError: If one or more required fields are missing, or if
            ``tags`` is present, is neither ``None`` nor ``""``, and is not a
            list or tuple.
    """
    data = {} if fm is None else fm

    def _is_blank(value: Any) -> bool:
        # Only strings are whitespace-checked; other falsy values (0, [], ...)
        # still count as "present".
        return value is None or (isinstance(value, str) and not value.strip())

    absent = [field for field in required if _is_blank(data.get(field))]
    if absent:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(absent)}")

    if "tags" not in data:
        return
    tags = data["tags"]
    if tags in (None, ""):
        # An explicitly empty tags entry is tolerated.
        return
    if not isinstance(tags, (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")
def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """Return a shallow copy of the frontmatter with tags and flags normalized.

    The input mapping is not modified. ``None`` is treated as an empty mapping.

    Normalization rules:
      * ``tags`` given as a string becomes a one-element list (or ``[]`` if
        the string is blank).
      * ``tags`` given as a list or tuple becomes a list of stripped strings,
        with ``None`` entries dropped.
      * ``tags`` given as ``None`` becomes ``[]``; any other scalar becomes a
        one-element list holding its stripped string form.
      * ``embedding_exclude`` is coerced to ``bool``. String values are
        interpreted textually, so ``"false"``, ``"no"``, ``"off"``, ``"0"``
        and blank strings yield ``False`` (case-insensitive).

    Args:
        fm: Parsed frontmatter mapping (may be ``None``).

    Returns:
        A new dict with the normalized values.
    """
    out = dict(fm or {})

    if "tags" in out:
        tags = out["tags"]
        if isinstance(tags, str):
            stripped = tags.strip()
            out["tags"] = [stripped] if stripped else []
        elif isinstance(tags, (list, tuple)):
            # Tuples pass validation, so they must be treated as a sequence
            # here rather than stringified into a single "('a', 'b')" tag.
            out["tags"] = [str(t).strip() for t in tags if t is not None]
        elif tags is None:
            out["tags"] = []
        else:
            out["tags"] = [str(tags).strip()]

    if "embedding_exclude" in out:
        flag = out["embedding_exclude"]
        if isinstance(flag, str):
            # bool("false") is True, so string flags are interpreted by content.
            out["embedding_exclude"] = flag.strip().lower() not in ("", "0", "false", "no", "off")
        else:
            out["embedding_exclude"] = bool(flag)

    return out
def extract_wikilinks(text: str) -> List[str]:
    """Collect the targets of all ``[[...]]`` wikilinks found in ``text``.

    For each link, anything after the first ``|`` and then anything after the
    first ``#`` is discarded, and the remainder is stripped. Links whose
    target ends up empty are skipped. Order of appearance is kept and
    duplicates are not removed.

    Args:
        text: Text to scan; a falsy value yields an empty list.

    Returns:
        The extracted link targets.
    """
    if not text:
        return []

    targets: List[str] = []
    for match in _WIKILINK_RE.finditer(text):
        candidate = (match.group(1) or "").strip()
        # Cut at "|" first, then at "#"; partition leaves the string
        # unchanged when the separator is absent.
        for separator in ("|", "#"):
            candidate = candidate.partition(separator)[0].strip()
        if candidate:
            targets.append(candidate)
    return targets
def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """WP-22: extract wikilinks together with their line numbers for the EdgeRegistry.

    Each ``[[...]]`` link in ``parsed.body`` produces one dict with the keys
    ``to`` (link target with any ``#...`` suffix removed), ``kind`` (the text
    after the first ``|``, or ``"related_to"`` when the link has no ``|``),
    ``line`` (1-based line number within ``parsed.body``) and ``provenance``
    (always ``"explicit"``). Links with an empty target are skipped.

    Args:
        parsed: Note whose ``body`` attribute is scanned; a falsy note or
            body yields an empty list.

    Returns:
        The edge dicts in order of appearance.
    """
    found = []
    if not parsed or not parsed.body:
        return found

    for number, text in enumerate(parsed.body.splitlines(), start=1):
        for hit in _WIKILINK_RE.finditer(text):
            inner = (hit.group(1) or "").strip()
            if not inner:
                continue

            head, pipe, tail = inner.partition("|")
            if pipe:
                destination, relation = head.strip(), tail.strip()
            else:
                destination, relation = inner, "related_to"

            # Drop a "#..." suffix from the target.
            destination = destination.partition("#")[0].strip()
            if not destination:
                continue

            found.append({"to": destination, "kind": relation, "line": number, "provenance": "explicit"})
    return found