app/core/parser.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
6ea452cc3f
commit
b4b7ea76ab
|
|
@ -1,67 +1,266 @@
|
||||||
from __future__ import annotations
|
#!/usr/bin/env python3
|
||||||
from dataclasses import dataclass
|
# -*- coding: utf-8 -*-
|
||||||
import re
|
"""
|
||||||
import os
|
Modul: app/core/parser.py
|
||||||
import unicodedata
|
Version: 1.7.0 (fault-tolerant)
|
||||||
import yaml
|
Datum: 2025-10-01
|
||||||
from typing import Tuple, Dict
|
|
||||||
|
|
||||||
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) # YAML-Frontmatter am Anfang
|
Zweck
|
||||||
|
-----
|
||||||
|
Fehlertolerantes Einlesen von Markdown-Dateien mit YAML-Frontmatter.
|
||||||
|
Kompatibel zur bisherigen Parser-API, aber robust gegenüber Nicht-UTF-8-Dateien:
|
||||||
|
- Versucht nacheinander: utf-8 → utf-8-sig → cp1252 → latin-1.
|
||||||
|
- Bei Fallback wird ein JSON-Warnhinweis auf stdout ausgegeben, Import bricht NICHT ab.
|
||||||
|
- YAML-Frontmatter wird mit '---' am Anfang und '---' als Abschluss erkannt.
|
||||||
|
- extract_wikilinks() normalisiert [[id#anchor|label]] → 'id'.
|
||||||
|
|
||||||
|
Öffentliche API (kompatibel):
|
||||||
|
- class ParsedNote(frontmatter: dict, body: str, path: str)
|
||||||
|
- read_markdown(path) -> ParsedNote | None
|
||||||
|
- normalize_frontmatter(fm) -> dict
|
||||||
|
- validate_required_frontmatter(fm, required: tuple[str,...]=("id","title")) -> None
|
||||||
|
- extract_wikilinks(text) -> list[str]
|
||||||
|
|
||||||
|
Beispiele
|
||||||
|
---------
|
||||||
|
# Einzelnes Markdown lesen
|
||||||
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||||
|
parsed = read_markdown("./vault/30_projects/project-demo.md")
|
||||||
|
fm = normalize_frontmatter(parsed.frontmatter)
|
||||||
|
validate_required_frontmatter(fm)
|
||||||
|
body = parsed.body
|
||||||
|
|
||||||
|
# Wikilinks extrahieren
|
||||||
|
from app.core.parser import extract_wikilinks
|
||||||
|
links = extract_wikilinks(body)
|
||||||
|
|
||||||
|
Abhängigkeiten
|
||||||
|
--------------
|
||||||
|
- PyYAML (yaml)
|
||||||
|
|
||||||
|
Lizenz: MIT (projektintern)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, Optional, Tuple, Iterable, List
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml # PyYAML
|
||||||
|
except Exception as e: # pragma: no cover
|
||||||
|
yaml = None # Fehler wird zur Laufzeit geworfen, falls wirklich benötigt
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
# Datenmodell
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ParsedNote:
|
class ParsedNote:
|
||||||
frontmatter: Dict
|
frontmatter: Dict[str, Any]
|
||||||
body: str
|
body: str
|
||||||
path: str
|
path: str
|
||||||
|
|
||||||
def _strip_bom(text: str) -> str:
|
|
||||||
return text.lstrip("\ufeff")
|
|
||||||
|
|
||||||
def _normalize_text(t: str) -> str:
|
# ---------------------------------------------------------------------
|
||||||
# Unicode-NFKC + vereinheitlichte Zeilenenden + Trim
|
# Frontmatter-Erkennung
|
||||||
t = unicodedata.normalize("NFKC", t)
|
# ---------------------------------------------------------------------
|
||||||
t = t.replace("\r\n", "\n").replace("\r", "\n")
|
|
||||||
return t
|
|
||||||
|
|
||||||
def read_markdown(path: str) -> ParsedNote:
|
# YAML-Frontmatter am Anfang der Datei:
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
# ---\n
|
||||||
raw = _strip_bom(f.read())
|
# <yaml>
|
||||||
raw = _normalize_text(raw)
|
# ---\n
|
||||||
|
_FRONTMATTER_HEAD = re.compile(r"^\s*---\s*$")
|
||||||
|
_FRONTMATTER_END = re.compile(r"^\s*---\s*$")
|
||||||
|
|
||||||
m = FRONTMATTER_RE.match(raw)
|
|
||||||
front, body = {}, raw
|
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
|
||||||
if m:
|
"""
|
||||||
yaml_block = m.group(1)
|
Zerlegt Text in (frontmatter: dict, body: str).
|
||||||
body = raw[m.end():]
|
Erkennt Frontmatter nur, wenn die erste Zeile '---' ist und innerhalb der ersten 2000 Zeilen ein zweites '---' folgt.
|
||||||
|
YAML-Fehler im Frontmatter führen NICHT zum Abbruch: es wird dann ein leeres dict benutzt.
|
||||||
|
"""
|
||||||
|
lines = text.splitlines(True) # keep line endings
|
||||||
|
if not lines:
|
||||||
|
return {}, ""
|
||||||
|
|
||||||
|
if not _FRONTMATTER_HEAD.match(lines[0]):
|
||||||
|
# kein Frontmatter-Header → gesamter Text ist Body
|
||||||
|
return {}, text
|
||||||
|
|
||||||
|
end_idx = None
|
||||||
|
# Suche nach nächstem '---' (max. 2000 Zeilen als Sicherheitslimit)
|
||||||
|
for i in range(1, min(len(lines), 2000)):
|
||||||
|
if _FRONTMATTER_END.match(lines[i]):
|
||||||
|
end_idx = i
|
||||||
|
break
|
||||||
|
|
||||||
|
if end_idx is None:
|
||||||
|
# unvollständiger Frontmatter-Block → behandle alles als Body
|
||||||
|
return {}, text
|
||||||
|
|
||||||
|
fm_raw = "".join(lines[1:end_idx])
|
||||||
|
body = "".join(lines[end_idx + 1:])
|
||||||
|
|
||||||
|
data: Dict[str, Any] = {}
|
||||||
|
if yaml is None:
|
||||||
|
raise RuntimeError("PyYAML ist nicht installiert (pip install pyyaml).")
|
||||||
|
|
||||||
|
try:
|
||||||
|
loaded = yaml.safe_load(fm_raw) or {}
|
||||||
|
if isinstance(loaded, dict):
|
||||||
|
data = loaded
|
||||||
|
else:
|
||||||
|
data = {}
|
||||||
|
except Exception as e:
|
||||||
|
# YAML-Fehler nicht fatal machen
|
||||||
|
print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)}))
|
||||||
|
data = {}
|
||||||
|
|
||||||
|
# optionales kosmetisches Trim: eine führende Leerzeile im Body entfernen
|
||||||
|
if body.startswith("\n"):
|
||||||
|
body = body[1:]
|
||||||
|
|
||||||
|
return data, body
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
# Robustes Lesen mit Encoding-Fallback
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
|
||||||
|
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
|
||||||
|
"""
|
||||||
|
Liest Datei mit mehreren Decodierungsversuchen.
|
||||||
|
Rückgabe: (text, used_encoding, had_fallback)
|
||||||
|
- had_fallback=True, falls ein anderes Encoding als 'utf-8' verwendet wurde (auch 'utf-8-sig' zählt als Fallback).
|
||||||
|
"""
|
||||||
|
last_err: Optional[str] = None
|
||||||
|
for enc in _FALLBACK_ENCODINGS:
|
||||||
try:
|
try:
|
||||||
front = yaml.safe_load(yaml_block) or {}
|
with io.open(path, "r", encoding=enc, errors="strict") as f:
|
||||||
if not isinstance(front, dict):
|
text = f.read()
|
||||||
raise ValueError("Frontmatter must be a mapping")
|
# 'utf-8-sig' zählt hier als Fallback (weil BOM), aber ist unproblematisch
|
||||||
except yaml.YAMLError as e:
|
return text, enc, (enc != "utf-8")
|
||||||
raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e
|
except UnicodeDecodeError as e:
|
||||||
|
last_err = f"{type(e).__name__}: {e}"
|
||||||
|
continue
|
||||||
|
|
||||||
return ParsedNote(frontmatter=front, body=body, path=path)
|
# Letzter, extrem defensiver Fallback: Bytes → UTF-8 mit REPLACE (keine Exception)
|
||||||
|
with open(path, "rb") as fb:
|
||||||
|
raw = fb.read()
|
||||||
|
text = raw.decode("utf-8", errors="replace")
|
||||||
|
print(json.dumps({
|
||||||
|
"path": path,
|
||||||
|
"warn": "encoding_fallback_exhausted",
|
||||||
|
"info": last_err or "unknown"
|
||||||
|
}, ensure_ascii=False))
|
||||||
|
return text, "utf-8(replace)", True
|
||||||
|
|
||||||
RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]") # [[id]] | [[id#anchor]] | [[id|label]]
|
|
||||||
|
|
||||||
def extract_wikilinks(text: str) -> list[str]:
|
# ---------------------------------------------------------------------
|
||||||
return list({m.group(1).strip() for m in RE_WIKILINK.finditer(text)})
|
# Öffentliche API
|
||||||
|
# ---------------------------------------------------------------------
|
||||||
|
|
||||||
def validate_required_frontmatter(fm: dict, required=("title","id","type","status","created")):
|
def read_markdown(path: str) -> Optional[ParsedNote]:
|
||||||
missing = [k for k in required if k not in fm or fm[k] in (None, "")]
|
"""
|
||||||
|
Liest eine Markdown-Datei fehlertolerant:
|
||||||
|
- Erlaubt verschiedene Encodings (UTF-8 bevorzugt, cp1252/latin-1 als Fallback).
|
||||||
|
- Schlägt NICHT mit UnicodeDecodeError fehl.
|
||||||
|
- Gibt ParsedNote(frontmatter, body, path) zurück oder None, falls die Datei nicht existiert.
|
||||||
|
|
||||||
|
Bei Decoding-Fallback wird ein JSON-Warnhinweis geloggt:
|
||||||
|
{"path": "...", "warn": "encoding_fallback_used", "used": "cp1252"}
|
||||||
|
"""
|
||||||
|
if not os.path.exists(path):
|
||||||
|
return None
|
||||||
|
|
||||||
|
text, enc, had_fb = _read_text_with_fallback(path)
|
||||||
|
if had_fb:
|
||||||
|
print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": enc}, ensure_ascii=False))
|
||||||
|
|
||||||
|
fm, body = _split_frontmatter(text)
|
||||||
|
return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_required_frontmatter(fm: Dict[str, Any],
|
||||||
|
required: Tuple[str, ...] = ("id", "title")) -> None:
|
||||||
|
"""
|
||||||
|
Prüft, ob alle Pflichtfelder vorhanden sind.
|
||||||
|
Default-kompatibel: ('id', 'title'), kann aber vom Aufrufer erweitert werden, z. B.:
|
||||||
|
validate_required_frontmatter(fm, required=("id","title","type","status","created"))
|
||||||
|
|
||||||
|
Löst ValueError aus, falls Felder fehlen oder leer sind.
|
||||||
|
"""
|
||||||
|
if fm is None:
|
||||||
|
fm = {}
|
||||||
|
missing = []
|
||||||
|
for k in required:
|
||||||
|
v = fm.get(k)
|
||||||
|
if v is None:
|
||||||
|
missing.append(k)
|
||||||
|
elif isinstance(v, str) and not v.strip():
|
||||||
|
missing.append(k)
|
||||||
if missing:
|
if missing:
|
||||||
raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
|
raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
|
||||||
|
|
||||||
# einfache Plausibilitäten
|
# Plausibilitäten: 'tags' sollte eine Liste sein, wenn vorhanden
|
||||||
if not isinstance(fm.get("tags", []), (list, tuple)):
|
if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
|
||||||
if "tags" in fm and fm["tags"] not in (None, ""):
|
raise ValueError("frontmatter 'tags' must be a list of strings")
|
||||||
raise ValueError("tags must be a list of strings")
|
|
||||||
|
|
||||||
def normalize_frontmatter(fm: dict) -> dict:
|
|
||||||
# kleinere Normalisierungen ohne die Semantik zu verändern
|
def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
out = dict(fm)
|
"""
|
||||||
if "tags" in out and isinstance(out["tags"], list):
|
Sanfte Normalisierung ohne Semantikänderung:
|
||||||
out["tags"] = [str(t).strip() for t in out["tags"]]
|
- 'tags' → Liste von Strings (Trim)
|
||||||
|
- 'embedding_exclude' → bool
|
||||||
|
- andere Felder unverändert
|
||||||
|
"""
|
||||||
|
out = dict(fm or {})
|
||||||
|
if "tags" in out:
|
||||||
|
if isinstance(out["tags"], str):
|
||||||
|
out["tags"] = [out["tags"].strip()] if out["tags"].strip() else []
|
||||||
|
elif isinstance(out["tags"], list):
|
||||||
|
out["tags"] = [str(t).strip() for t in out["tags"] if t is not None]
|
||||||
|
else:
|
||||||
|
# Unbekannter Typ → in Liste mit String umwandeln
|
||||||
|
out["tags"] = [str(out["tags"]).strip()] if out["tags"] not in (None, "") else []
|
||||||
if "embedding_exclude" in out:
|
if "embedding_exclude" in out:
|
||||||
out["embedding_exclude"] = bool(out["embedding_exclude"])
|
out["embedding_exclude"] = bool(out["embedding_exclude"])
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------ Wikilinks ---------------------------- #
|
||||||
|
|
||||||
|
# Basismuster für [[...]]; die Normalisierung (id vor '#', vor '|') macht extract_wikilinks
|
||||||
|
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_wikilinks(text: str) -> List[str]:
|
||||||
|
"""
|
||||||
|
Extrahiert Wikilinks wie [[id]], [[id#anchor]], [[id|label]], [[id#anchor|label]].
|
||||||
|
Rückgabe sind NUR die Ziel-IDs (ohne Anchor/Label), führend/folgend getrimmt.
|
||||||
|
Keine aggressive Slug-Normalisierung (die kann später im Resolver erfolgen).
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return []
|
||||||
|
out: List[str] = []
|
||||||
|
for m in _WIKILINK_RE.finditer(text):
|
||||||
|
raw = (m.group(1) or "").strip()
|
||||||
|
if not raw:
|
||||||
|
continue
|
||||||
|
# Split an Pipe (Label) → links vor '|'
|
||||||
|
if "|" in raw:
|
||||||
|
raw = raw.split("|", 1)[0].strip()
|
||||||
|
# Split an Anchor
|
||||||
|
if "#" in raw:
|
||||||
|
raw = raw.split("#", 1)[0].strip()
|
||||||
|
if raw:
|
||||||
|
out.append(raw)
|
||||||
|
return out
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user