WP20 - parser

2025-12-23 14:38:27 +01:00 · 2025-12-23 14:38:27 +01:00 · 0ac8a14ea7
commit 0ac8a14ea7
parent 234949800b
1 changed files with 48 additions and 24 deletions
--- a/app/core/parser.py
+++ b/app/core/parser.py
@ -1,10 +1,11 @@
 """
 FILE: app/core/parser.py
 DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
-VERSION: 1.7.1
+             WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry.
 VERSION: 1.8.0
 STATUS: Active
 DEPENDENCIES: yaml, re, dataclasses, json, io, os
-LAST_ANALYSIS: 2025-12-15
+LAST_ANALYSIS: 2025-12-23
 """
 from __future__ import annotations
@ -138,13 +139,7 @@ def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
 def read_markdown(path: str) -> Optional[ParsedNote]:
    """
-    Liest eine Markdown-Datei fehlertolerant:
+    Liest eine Markdown-Datei fehlertolerant.
    - Erlaubt verschiedene Encodings (UTF-8 bevorzugt, cp1252/latin-1 als Fallback).
    - Schlägt NICHT mit UnicodeDecodeError fehl.
    - Gibt ParsedNote(frontmatter, body, path) zurück oder None, falls die Datei nicht existiert.
    Bei Decoding-Fallback wird ein JSON-Warnhinweis geloggt:
        {"path": "...", "warn": "encoding_fallback_used", "used": "cp1252"}
    """
    if not os.path.exists(path):
        return None
@ -161,10 +156,6 @@ def validate_required_frontmatter(fm: Dict[str, Any],
                                  required: Tuple[str, ...] = ("id", "title")) -> None:
    """
    Prüft, ob alle Pflichtfelder vorhanden sind.
    Default-kompatibel: ('id', 'title'), kann aber vom Aufrufer erweitert werden, z. B.:
        validate_required_frontmatter(fm, required=("id","title","type","status","created"))
    Hebt ValueError, falls Felder fehlen oder leer sind.
    """
    if fm is None:
        fm = {}
@ -178,17 +169,13 @@ def validate_required_frontmatter(fm: Dict[str, Any],
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
    # Plausibilitäten: 'tags' sollte eine Liste sein, wenn vorhanden
    if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")
 def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """
-    Sanfte Normalisierung ohne Semantikänderung:
+    Normalisierung von Tags und anderen Feldern.
    - 'tags' → Liste von Strings (Trim)
    - 'embedding_exclude' → bool
    - andere Felder unverändert
    """
    out = dict(fm or {})
    if "tags" in out:
@ -205,15 +192,12 @@ def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
 # ------------------------------ Wikilinks ---------------------------- #
 # Basismuster für [[...]]; die Normalisierung (id vor '#', vor '|') macht extract_wikilinks
 _WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
 def extract_wikilinks(text: str) -> List[str]:
    """
-    Extrahiert Wikilinks wie [[id]], [[id#anchor]], [[id|label]], [[id#anchor|label]].
+    Extrahiert Wikilinks als einfache Liste von IDs.
    Rückgabe sind NUR die Ziel-IDs (ohne Anchor/Label), führend/folgend getrimmt.
    Keine aggressive Slug-Normalisierung (die kann später im Resolver erfolgen).
    """
    if not text:
        return []
@ -222,12 +206,52 @@ def extract_wikilinks(text: str) -> List[str]:
        raw = (m.group(1) or "").strip()
        if not raw:
            continue
        # Split an Pipe (Label) → links vor '|'
        if "|" in raw:
            raw = raw.split("|", 1)[0].strip()
        # Split an Anchor
        if "#" in raw:
            raw = raw.split("#", 1)[0].strip()
        if raw:
            out.append(raw)
    return out
 def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """
    WP-22: Extract wikilink edges from a note body together with line numbers.

    Every ``[[target]]`` or ``[[target|kind]]`` occurrence in ``parsed.body``
    produces one dictionary with the keys:

    - ``"to"``: the link target, trimmed, with any ``#anchor`` suffix removed
    - ``"kind"``: the trimmed text after the first ``|``; ``"related_to"`` when
      the link has no pipe or the text after the pipe is blank
    - ``"line"``: 1-based index of the line inside ``parsed.body`` as produced
      by ``str.splitlines()``
    - ``"provenance"``: always ``"explicit"``

    Returns an empty list when ``parsed`` is falsy or has an empty body.
    Links whose target is empty after trimming are skipped.

    NOTE(review): line numbers are relative to the body, not to the file on
    disk; if the body excludes a frontmatter block, consumers may need to add
    an offset - confirm against the EdgeRegistry usage.
    """
    edges: List[Dict[str, Any]] = []
    if not parsed or not parsed.body:
        return edges
    default_kind = "related_to"
    # Plain splitlines() is sufficient: only the line index is recorded,
    # no character offsets, so line terminators do not need to be kept.
    for line_num, line_content in enumerate(parsed.body.splitlines(), 1):
        for match in _WIKILINK_RE.finditer(line_content):
            raw = (match.group(1) or "").strip()
            if not raw:
                continue
            # Syntax: [[target|kind]] - only the first pipe separates the parts.
            target, _, kind = raw.partition("|")
            # A missing or blank kind (e.g. "[[target|]]") falls back to the
            # default instead of emitting an edge with an empty type.
            kind = kind.strip() or default_kind
            # Drop the anchor (#...): relations are resolved per note id.
            target = target.partition("#")[0].strip()
            if target:
                edges.append({
                    "to": target,
                    "kind": kind,
                    "line": line_num,
                    "provenance": "explicit",
                })
    return edges