69 lines
2.9 KiB
Python
69 lines
2.9 KiB
Python
"""
|
|
FILE: app/core/parsing/parsing_utils.py
|
|
DESCRIPTION: Werkzeuge zur Validierung, Normalisierung und Wikilink-Extraktion.
|
|
"""
|
|
import re
|
|
from typing import Any, Dict, List, Tuple, Optional
|
|
from .parsing_models import ParsedNote
|
|
|
|
# Public constants kept at module level for backward compatibility.
# Matches a YAML frontmatter fence line: '---' alone on a line (whitespace allowed).
FRONTMATTER_RE = re.compile(r"^\s*---\s*$")
# Captures the inner text of a [[...]] wikilink; ']' is not allowed inside,
# so nested/unclosed brackets never match.
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
|
|
def validate_required_frontmatter(fm: Optional[Dict[str, Any]], required: Tuple[str, ...] = ("id", "title")) -> None:
    """Validate that all required frontmatter fields are present and non-empty.

    Args:
        fm: Parsed frontmatter mapping; ``None`` is treated as an empty mapping.
        required: Field names that must be present with non-blank values.

    Raises:
        ValueError: If any required field is missing, ``None``, or a
            whitespace-only string, or if ``tags`` is present, non-empty,
            and not a list/tuple.
    """
    if fm is None:
        fm = {}

    def _blank(value: Any) -> bool:
        # A field counts as missing when it is None or a whitespace-only string.
        return value is None or (isinstance(value, str) and not value.strip())

    missing = [k for k in required if _blank(fm.get(k))]
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
    # Tuples are tolerated alongside lists; None/"" mean "no tags" and are
    # normalized away later by normalize_frontmatter.
    tags = fm.get("tags")
    if "tags" in fm and tags not in (None, "") and not isinstance(tags, (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")
|
|
|
|
def normalize_frontmatter(fm: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Return a normalized copy of the frontmatter mapping.

    Normalization rules:
      - ``tags`` becomes a list of stripped strings: a scalar becomes a
        one-element list, ``None``/"" become ``[]``, and ``None`` entries
        inside a list/tuple are dropped.
      - ``embedding_exclude`` is coerced to ``bool``.

    Args:
        fm: Raw frontmatter mapping; ``None`` is treated as empty.

    Returns:
        A new dict; the input mapping is never mutated.
    """
    out = dict(fm or {})
    if "tags" in out:
        tags = out["tags"]
        if isinstance(tags, str):
            stripped = tags.strip()
            out["tags"] = [stripped] if stripped else []
        elif isinstance(tags, (list, tuple)):
            # Tuples are accepted by validate_required_frontmatter, so treat
            # them like lists here (fix: a tuple previously fell into the
            # scalar branch and was stringified wholesale).
            out["tags"] = [str(t).strip() for t in tags if t is not None]
        else:
            # Any other scalar (int, bool, ...) becomes a one-element list.
            out["tags"] = [str(tags).strip()] if tags not in (None, "") else []
    if "embedding_exclude" in out:
        out["embedding_exclude"] = bool(out["embedding_exclude"])
    return out
|
|
|
|
def extract_wikilinks(text: str) -> List[str]:
    """Extract wikilink target IDs from ``text`` as a flat list.

    The alias part (after ``|``) and any heading anchor (after ``#``) are
    discarded; links that are empty after stripping are skipped.
    """
    if not text:
        return []
    targets: List[str] = []
    for match in _WIKILINK_RE.finditer(text):
        inner = (match.group(1) or "").strip()
        # Drop the alias first, then the heading anchor, then whitespace.
        target = inner.partition("|")[0].partition("#")[0].strip()
        if target:
            targets.append(target)
    return targets
|
|
|
|
def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """WP-22: Extract wikilinks with line numbers for the EdgeRegistry.

    Each edge dict contains ``to`` (target id), ``kind`` (the alias part
    of the link, or ``"related_to"`` when absent), ``line`` (1-based line
    number within the note body), and ``provenance`` (always "explicit").
    """
    if not parsed or not parsed.body:
        return []
    edges: List[Dict[str, Any]] = []
    for lineno, text in enumerate(parsed.body.splitlines(), start=1):
        for match in _WIKILINK_RE.finditer(text):
            inner = (match.group(1) or "").strip()
            if not inner:
                continue
            # The alias part (after '|'), when present, names the edge kind.
            target, sep, alias = inner.partition("|")
            kind = alias.strip() if sep else "related_to"
            # Heading anchors never belong to the target id.
            target = target.partition("#")[0].strip()
            if target:
                edges.append({
                    "to": target,
                    "kind": kind,
                    "line": lineno,
                    "provenance": "explicit",
                })
    return edges