WP20 - parser

2025-12-23 14:38:27 +01:00 · 2025-12-23 14:38:27 +01:00 · 0ac8a14ea7
commit 0ac8a14ea7
parent 234949800b
1 changed files with 48 additions and 24 deletions
--- a/app/core/parser.py
+++ b/app/core/parser.py
@ -1,10 +1,11 @@
 """
 FILE: app/core/parser.py
 DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
-VERSION: 1.7.1
+             WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry.
+VERSION: 1.8.0
 STATUS: Active
 DEPENDENCIES: yaml, re, dataclasses, json, io, os
-LAST_ANALYSIS: 2025-12-15
+LAST_ANALYSIS: 2025-12-23
 """
 from __future__ import annotations

@ -138,13 +139,7 @@ def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:

 def read_markdown(path: str) -> Optional[ParsedNote]:
    """
-    Liest eine Markdown-Datei fehlertolerant:
-    - Erlaubt verschiedene Encodings (UTF-8 bevorzugt, cp1252/latin-1 als Fallback).
-    - Schlägt NICHT mit UnicodeDecodeError fehl.
-    - Gibt ParsedNote(frontmatter, body, path) zurück oder None, falls die Datei nicht existiert.
-
-    Bei Decoding-Fallback wird ein JSON-Warnhinweis geloggt:
-        {"path": "...", "warn": "encoding_fallback_used", "used": "cp1252"}
+    Liest eine Markdown-Datei fehlertolerant.
    """
    if not os.path.exists(path):
        return None
@ -161,10 +156,6 @@ def validate_required_frontmatter(fm: Dict[str, Any],
                                  required: Tuple[str, ...] = ("id", "title")) -> None:
    """
    Prüft, ob alle Pflichtfelder vorhanden sind.
-    Default-kompatibel: ('id', 'title'), kann aber vom Aufrufer erweitert werden, z. B.:
-        validate_required_frontmatter(fm, required=("id","title","type","status","created"))
-
-    Hebt ValueError, falls Felder fehlen oder leer sind.
    """
    if fm is None:
        fm = {}
@ -178,17 +169,13 @@ def validate_required_frontmatter(fm: Dict[str, Any],
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")

-    # Plausibilitäten: 'tags' sollte eine Liste sein, wenn vorhanden
    if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")


 def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """
-    Sanfte Normalisierung ohne Semantikänderung:
-    - 'tags' → Liste von Strings (Trim)
-    - 'embedding_exclude' → bool
-    - andere Felder unverändert
+    Normalisierung von Tags und anderen Feldern.
    """
    out = dict(fm or {})
    if "tags" in out:
@ -205,15 +192,12 @@ def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:

 # ------------------------------ Wikilinks ---------------------------- #

-# Basismuster für [[...]]; die Normalisierung (id vor '#', vor '|') macht extract_wikilinks
 _WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


 def extract_wikilinks(text: str) -> List[str]:
    """
-    Extrahiert Wikilinks wie [[id]], [[id#anchor]], [[id|label]], [[id#anchor|label]].
-    Rückgabe sind NUR die Ziel-IDs (ohne Anchor/Label), führend/folgend getrimmt.
-    Keine aggressive Slug-Normalisierung (die kann später im Resolver erfolgen).
+    Extrahiert Wikilinks als einfache Liste von IDs.
    """
    if not text:
        return []
@ -222,12 +206,52 @@ def extract_wikilinks(text: str) -> List[str]:
        raw = (m.group(1) or "").strip()
        if not raw:
            continue
-        # Split an Pipe (Label) → links vor '|'
        if "|" in raw:
            raw = raw.split("|", 1)[0].strip()
-        # Split an Anchor
        if "#" in raw:
            raw = raw.split("#", 1)[0].strip()
        if raw:
            out.append(raw)
    return out
+
+
+def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
+    """
+    WP-22: Extract typed wikilink edges from a note body together with their line numbers.
+
+    Recognised syntax is [[target]], [[target|kind]] and [[target#anchor|kind]]. The anchor
+    is dropped because relations are based on the note ID only. A missing or empty kind
+    (e.g. [[target|]]) falls back to "related_to".
+
+    Returns one dict per link with the keys "to", "kind", "line" and "provenance" (always
+    "explicit"). "line" is 1-based and counted within parsed.body. An empty list is
+    returned when parsed or its body is missing/empty.
+    """
+    edges: List[Dict[str, Any]] = []
+    if not parsed or not parsed.body:
+        return edges
+
+    # splitlines() without keepends is sufficient: only the line number is recorded.
+    for line_num, line_content in enumerate(parsed.body.splitlines(), 1):
+        for match in _WIKILINK_RE.finditer(line_content):
+            raw = (match.group(1) or "").strip()
+            if not raw:
+                continue
+
+            # Syntax: [[target|kind]]; partition() yields an empty kind when no pipe exists.
+            target, _, kind = raw.partition("|")
+            # An empty kind must not leak into the edge; use the default relation type.
+            kind = kind.strip() or "related_to"
+
+            # Strip the anchor (#...): relations are resolved on note (ID) level.
+            target = target.split("#", 1)[0].strip()
+            if not target:
+                continue
+
+            edges.append({
+                "to": target,
+                "kind": kind,
+                "line": line_num,
+                "provenance": "explicit",
+            })
+    return edges