app/core/parser.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
6ea452cc3f
commit
b4b7ea76ab
|
|
@ -1,67 +1,266 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
import os
|
||||
import unicodedata
|
||||
import yaml
|
||||
from typing import Tuple, Dict
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/parser.py
|
||||
Version: 1.7.0 (fault-tolerant)
|
||||
Datum: 2025-10-01
|
||||
|
||||
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) # YAML-Frontmatter am Anfang
|
||||
Zweck
|
||||
-----
|
||||
Fehlertolerantes Einlesen von Markdown-Dateien mit YAML-Frontmatter.
|
||||
Kompatibel zur bisherigen Parser-API, aber robust gegenüber Nicht-UTF-8-Dateien:
|
||||
- Versucht nacheinander: utf-8 → utf-8-sig → cp1252 → latin-1.
|
||||
- Bei Fallback wird ein JSON-Warnhinweis auf stdout ausgegeben, Import bricht NICHT ab.
|
||||
- YAML-Frontmatter wird mit '---' am Anfang und '---' als Abschluss erkannt.
|
||||
- extract_wikilinks() normalisiert [[id#anchor|label]] → 'id'.
|
||||
|
||||
Öffentliche API (kompatibel):
|
||||
- class ParsedNote(frontmatter: dict, body: str, path: str)
|
||||
- read_markdown(path) -> ParsedNote | None
|
||||
- normalize_frontmatter(fm) -> dict
|
||||
- validate_required_frontmatter(fm, required: tuple[str,...]=("id","title")) -> None
|
||||
- extract_wikilinks(text) -> list[str]
|
||||
|
||||
Beispiele
|
||||
---------
|
||||
# Einzelnes Markdown lesen
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
parsed = read_markdown("./vault/30_projects/project-demo.md")
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
body = parsed.body
|
||||
|
||||
# Wikilinks extrahieren
|
||||
from app.core.parser import extract_wikilinks
|
||||
links = extract_wikilinks(body)
|
||||
|
||||
Abhängigkeiten
|
||||
--------------
|
||||
- PyYAML (yaml)
|
||||
|
||||
Lizenz: MIT (projektintern)
|
||||
"""
|
||||
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Tuple

import io
import json
import os
import re
import unicodedata

try:
    import yaml  # PyYAML
except Exception as e:  # pragma: no cover
    yaml = None  # Fehler wird zur Laufzeit geworfen, falls wirklich benötigt
||||
# ---------------------------------------------------------------------
|
||||
# Datamodell
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
@dataclass
class ParsedNote:
    """A parsed Markdown note: YAML frontmatter, body text, and source path."""
    frontmatter: Dict[str, Any]  # parsed YAML frontmatter ({} when absent/unparseable)
    body: str                    # Markdown body after the frontmatter block
    path: str                    # filesystem path the note was read from
|
||||
|
||||
def _strip_bom(text: str) -> str:
|
||||
return text.lstrip("\ufeff")
|
||||
|
||||
def _normalize_text(t: str) -> str:
    """Apply Unicode NFKC normalization and unify all line endings to LF."""
    # NOTE(review): the pre-refactor comment also promised trimming, but no
    # trim is performed here — only normalization and newline unification.
    t = unicodedata.normalize("NFKC", t)
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    return t
|
||||
# ---------------------------------------------------------------------
|
||||
# Frontmatter-Erkennung
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
def read_markdown(path: str) -> ParsedNote:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
raw = _strip_bom(f.read())
|
||||
raw = _normalize_text(raw)
|
||||
# YAML-Frontmatter am Anfang der Datei:
|
||||
# ---\n
|
||||
# <yaml>
|
||||
# ---\n
|
||||
_FRONTMATTER_HEAD = re.compile(r"^\s*---\s*$")
|
||||
_FRONTMATTER_END = re.compile(r"^\s*---\s*$")
|
||||
|
||||
m = FRONTMATTER_RE.match(raw)
|
||||
front, body = {}, raw
|
||||
if m:
|
||||
yaml_block = m.group(1)
|
||||
body = raw[m.end():]
|
||||
|
||||
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
    """
    Split *text* into (frontmatter: dict, body: str).

    Frontmatter is only recognized when the first line is '---' and a second
    '---' line follows later. YAML errors inside the frontmatter do NOT abort
    parsing: an empty dict is used instead and a JSON warning is printed.
    """
    lines = text.splitlines(True)  # keep line endings
    if not lines:
        return {}, ""

    if not _FRONTMATTER_HEAD.match(lines[0]):
        # no frontmatter header -> the whole text is the body
        return {}, text

    end_idx = None
    # search for the next '---' (max. 2000 lines as a safety limit)
    for i in range(1, min(len(lines), 2000)):
        if _FRONTMATTER_END.match(lines[i]):
            end_idx = i
            break

    if end_idx is None:
        # incomplete frontmatter block -> treat everything as body
        return {}, text

    fm_raw = "".join(lines[1:end_idx])
    body = "".join(lines[end_idx + 1:])

    data: Dict[str, Any] = {}
    if yaml is None:
        # only now does PyYAML become mandatory; files without frontmatter
        # never reach this point
        raise RuntimeError("PyYAML ist nicht installiert (pip install pyyaml).")

    try:
        loaded = yaml.safe_load(fm_raw) or {}
        if isinstance(loaded, dict):
            data = loaded
        else:
            # scalar/list frontmatter is ignored rather than rejected
            data = {}
    except Exception as e:
        # YAML errors are deliberately non-fatal (fault-tolerant import)
        print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)}))
        data = {}

    # cosmetic trim: drop a single leading blank line from the body
    if body.startswith("\n"):
        body = body[1:]

    return data, body
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Robustes Lesen mit Encoding-Fallback
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
|
||||
|
||||
|
||||
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
|
||||
"""
|
||||
Liest Datei mit mehreren Decodierungsversuchen.
|
||||
Rückgabe: (text, used_encoding, had_fallback)
|
||||
- had_fallback=True, falls NICHT 'utf-8' verwendet wurde (oder 'utf-8-sig').
|
||||
"""
|
||||
last_err: Optional[str] = None
|
||||
for enc in _FALLBACK_ENCODINGS:
|
||||
try:
|
||||
front = yaml.safe_load(yaml_block) or {}
|
||||
if not isinstance(front, dict):
|
||||
raise ValueError("Frontmatter must be a mapping")
|
||||
except yaml.YAMLError as e:
|
||||
raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e
|
||||
with io.open(path, "r", encoding=enc, errors="strict") as f:
|
||||
text = f.read()
|
||||
# 'utf-8-sig' zählt hier als Fallback (weil BOM), aber ist unproblematisch
|
||||
return text, enc, (enc != "utf-8")
|
||||
except UnicodeDecodeError as e:
|
||||
last_err = f"{type(e).__name__}: {e}"
|
||||
continue
|
||||
|
||||
return ParsedNote(frontmatter=front, body=body, path=path)
|
||||
# Letzter, extrem defensiver Fallback: Bytes → UTF-8 mit REPLACE (keine Exception)
|
||||
with open(path, "rb") as fb:
|
||||
raw = fb.read()
|
||||
text = raw.decode("utf-8", errors="replace")
|
||||
print(json.dumps({
|
||||
"path": path,
|
||||
"warn": "encoding_fallback_exhausted",
|
||||
"info": last_err or "unknown"
|
||||
}, ensure_ascii=False))
|
||||
return text, "utf-8(replace)", True
|
||||
|
||||
# Legacy wikilink pattern, kept for backward compatibility with any external
# callers. The duplicate extract_wikilinks() that accompanied it returned a
# set-deduplicated list in arbitrary order and was shadowed by the ordered
# implementation defined below, so it has been removed.
RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]")  # [[id]] | [[id#anchor]] | [[id|label]]
|
||||
# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
# NOTE(review): a truncated pre-refactor validate_required_frontmatter
# fragment lived here — it computed `missing` but never raised, and was
# shadowed by the complete implementation defined further down. Removed.
|
||||
def read_markdown(path: str) -> Optional[ParsedNote]:
    """
    Read a Markdown file fault-tolerantly.

    - Accepts several encodings (UTF-8 preferred, cp1252/latin-1 fallback).
    - Never fails with UnicodeDecodeError.
    - Returns ParsedNote(frontmatter, body, path), or None if the file does
      not exist.

    When a decoding fallback was used, a JSON warning is printed:
        {"path": "...", "warn": "encoding_fallback_used", "used": "cp1252"}
    """
    if not os.path.exists(path):
        return None

    text, enc, had_fb = _read_text_with_fallback(path)
    if had_fb:
        # non-fatal diagnostic so bulk imports keep going on legacy encodings
        print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": enc}, ensure_ascii=False))

    fm, body = _split_frontmatter(text)
    # defensive: never hand callers None for frontmatter/body
    return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)
|
||||
|
||||
|
||||
def validate_required_frontmatter(fm: Dict[str, Any],
                                  required: Tuple[str, ...] = ("id", "title")) -> None:
    """
    Check that all required frontmatter fields are present and non-empty.

    Default is backward compatible: ('id', 'title'); callers may extend it,
    e.g.:
        validate_required_frontmatter(fm, required=("id","title","type","status","created"))

    Raises ValueError if fields are missing or blank, or if 'tags' is present
    but not a list/tuple. (A diff splice had left two conflicting 'tags'
    checks with different error messages; only the newer one is kept.)
    """
    if fm is None:
        fm = {}
    missing = []
    for k in required:
        v = fm.get(k)
        if v is None:
            missing.append(k)
        elif isinstance(v, str) and not v.strip():
            # present but blank counts as missing
            missing.append(k)
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")

    # Plausibility: 'tags' should be a list when present and non-empty
    if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")
|
||||
|
||||
# NOTE(review): a stale normalize_frontmatter() fragment lived here — it
# built its normalized dict but never returned it (implicitly returning
# None) and was shadowed by the complete implementation defined below.
# The dead fragment has been removed.
|
||||
|
||||
def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """
    Gentle normalization without changing semantics:
      - 'tags'              -> list of trimmed strings
      - 'embedding_exclude' -> bool
      - all other fields pass through unchanged
    """
    result = dict(fm or {})

    if "tags" in result:
        tags = result["tags"]
        if isinstance(tags, str):
            stripped = tags.strip()
            result["tags"] = [stripped] if stripped else []
        elif isinstance(tags, list):
            result["tags"] = [str(item).strip() for item in tags if item is not None]
        else:
            # unknown type -> coerce to a single-element string list
            result["tags"] = [] if tags in (None, "") else [str(tags).strip()]

    if "embedding_exclude" in result:
        result["embedding_exclude"] = bool(result["embedding_exclude"])

    return result
|
||||
|
||||
|
||||
# ------------------------------ Wikilinks ---------------------------- #

# Base pattern for [[...]]; extract_wikilinks() does the normalization
# (id before '#', before '|') itself.
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


def extract_wikilinks(text: str) -> List[str]:
    """
    Extract wikilinks such as [[id]], [[id#anchor]], [[id|label]],
    [[id#anchor|label]].

    Returns ONLY the target ids (without anchor/label), whitespace-trimmed,
    in document order, duplicates included. No aggressive slug
    normalization (that can happen later in the resolver).
    """
    if not text:
        return []
    targets: List[str] = []
    for match in _WIKILINK_RE.finditer(text):
        inner = (match.group(1) or "").strip()
        if not inner:
            continue
        # drop an optional |label suffix first, then an optional #anchor
        inner = inner.split("|", 1)[0].strip()
        inner = inner.split("#", 1)[0].strip()
        if inner:
            targets.append(inner)
    return targets
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user