From 0ac8a14ea7b3e1edf41eaff344b55ede5511e45a Mon Sep 17 00:00:00 2001
From: Lars <Lars@stommer.de>
Date: Tue, 23 Dec 2025 14:38:27 +0100
Subject: [PATCH] WP20 - parser

---
 app/core/parser.py | 72 ++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/app/core/parser.py b/app/core/parser.py
index 9d106e8..b47aeb7 100644
--- a/app/core/parser.py
+++ b/app/core/parser.py
@@ -1,10 +1,11 @@
 """
 FILE: app/core/parser.py
 DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
-VERSION: 1.7.1
+             WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry.
+VERSION: 1.8.0
 STATUS: Active
 DEPENDENCIES: yaml, re, dataclasses, json, io, os
-LAST_ANALYSIS: 2025-12-15
+LAST_ANALYSIS: 2025-12-23
 """
 from __future__ import annotations
 
@@ -138,13 +139,7 @@ def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
 
 def read_markdown(path: str) -> Optional[ParsedNote]:
     """
-    Liest eine Markdown-Datei fehlertolerant:
-    - Erlaubt verschiedene Encodings (UTF-8 bevorzugt, cp1252/latin-1 als Fallback).
-    - Schlägt NICHT mit UnicodeDecodeError fehl.
-    - Gibt ParsedNote(frontmatter, body, path) zurück oder None, falls die Datei nicht existiert.
-
-    Bei Decoding-Fallback wird ein JSON-Warnhinweis geloggt:
-        {"path": "...", "warn": "encoding_fallback_used", "used": "cp1252"}
+    Liest eine Markdown-Datei fehlertolerant.
     """
     if not os.path.exists(path):
         return None
@@ -161,10 +156,6 @@ def validate_required_frontmatter(fm: Dict[str, Any],
                                   required: Tuple[str, ...] = ("id", "title")) -> None:
     """
     Prüft, ob alle Pflichtfelder vorhanden sind.
-    Default-kompatibel: ('id', 'title'), kann aber vom Aufrufer erweitert werden, z. B.:
-        validate_required_frontmatter(fm, required=("id","title","type","status","created"))
-
-    Hebt ValueError, falls Felder fehlen oder leer sind.
     """
     if fm is None:
         fm = {}
@@ -178,17 +169,13 @@ def validate_required_frontmatter(fm: Dict[str, Any],
     if missing:
         raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
 
-    # Plausibilitäten: 'tags' sollte eine Liste sein, wenn vorhanden
     if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
         raise ValueError("frontmatter 'tags' must be a list of strings")
 
 
 def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
     """
-    Sanfte Normalisierung ohne Semantikänderung:
-    - 'tags' → Liste von Strings (Trim)
-    - 'embedding_exclude' → bool
-    - andere Felder unverändert
+    Normalisierung von Tags und anderen Feldern.
     """
     out = dict(fm or {})
     if "tags" in out:
@@ -205,15 +192,12 @@ def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
 
 # ------------------------------ Wikilinks ---------------------------- #
 
-# Basismuster für [[...]]; die Normalisierung (id vor '#', vor '|') macht extract_wikilinks
 _WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
 
 
 def extract_wikilinks(text: str) -> List[str]:
     """
-    Extrahiert Wikilinks wie [[id]], [[id#anchor]], [[id|label]], [[id#anchor|label]].
-    Rückgabe sind NUR die Ziel-IDs (ohne Anchor/Label), führend/folgend getrimmt.
-    Keine aggressive Slug-Normalisierung (die kann später im Resolver erfolgen).
+    Extrahiert Wikilinks als einfache Liste von IDs.
     """
     if not text:
         return []
@@ -222,12 +206,52 @@ def extract_wikilinks(text: str) -> List[str]:
         raw = (m.group(1) or "").strip()
         if not raw:
             continue
-        # Split an Pipe (Label) → links vor '|'
         if "|" in raw:
             raw = raw.split("|", 1)[0].strip()
-        # Split an Anchor
         if "#" in raw:
             raw = raw.split("#", 1)[0].strip()
         if raw:
             out.append(raw)
     return out
+
+
+def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
+    """
+    WP-22: Extract wikilinks of the form [[target|kind]] from the note body.
+
+    Every link with a non-empty target yields one dict with the keys:
+    - "to": link target, trimmed, with any "#anchor" suffix removed
+    - "kind": trimmed text after the first "|"; "related_to" when the link
+      has no "|" or the text after it is blank
+    - "line": 1-based line number, counted within parsed.body
+    - "provenance": always "explicit"
+
+    Returns an empty list when parsed or parsed.body is empty.
+
+    NOTE(review): extract_wikilinks treats the text after "|" as a display
+    label, whereas it is read as the edge kind here - confirm this is intended.
+    """
+    edges: List[Dict[str, Any]] = []
+    if not parsed or not parsed.body:
+        return edges
+
+    # splitlines() drops the line terminators; only the line index is needed.
+    for line_num, line_content in enumerate(parsed.body.splitlines(), 1):
+        for match in _WIKILINK_RE.finditer(line_content):
+            # partition() splits on the first "|" only and leaves the kind
+            # empty when the link contains no "|" at all.
+            target, _, kind = match.group(1).partition("|")
+
+            # Relations are note-level (ID based), so drop a "#anchor".
+            target = target.split("#", 1)[0].strip()
+            if not target:
+                continue
+
+            edges.append({
+                "to": target,
+                # "[[x]]" and "[[x|]]" both fall back to the default kind.
+                "kind": kind.strip() or "related_to",
+                "line": line_num,
+                "provenance": "explicit",
+            })
+    return edges
\ No newline at end of file