mindnet/app/core/parser/parsing_markdown.py
2025-12-27 14:31:12 +01:00

60 lines
2.1 KiB
Python

"""
FILE: app/core/parsing/parsing_markdown.py
DESCRIPTION: Fehlertolerantes Einlesen von Markdown und Frontmatter-Splitting.
"""
import io
import os
import json
from typing import Any, Dict, Optional, Tuple
from .parsing_models import ParsedNote
from .parsing_utils import FRONTMATTER_RE
try:
import yaml
except ImportError:
yaml = None
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
"""Zerlegt Text in Frontmatter-Dict und Body."""
lines = text.splitlines(True)
if not lines or not FRONTMATTER_RE.match(lines[0]):
return {}, text
end_idx = None
for i in range(1, min(len(lines), 2000)):
if FRONTMATTER_RE.match(lines[i]):
end_idx = i
break
if end_idx is None: return {}, text
fm_raw = "".join(lines[1:end_idx])
body = "".join(lines[end_idx + 1:])
if yaml is None: raise RuntimeError("PyYAML not installed.")
try:
loaded = yaml.safe_load(fm_raw) or {}
data = loaded if isinstance(loaded, dict) else {}
except Exception as e:
print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)}))
data = {}
if body.startswith("\n"): body = body[1:]
return data, body
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
"""Liest Datei mit Encoding-Fallback-Kette."""
last_err = None
for enc in _FALLBACK_ENCODINGS:
try:
with io.open(path, "r", encoding=enc, errors="strict") as f:
return f.read(), enc, (enc != "utf-8")
except UnicodeDecodeError as e:
last_err = str(e); continue
with open(path, "rb") as fb:
text = fb.read().decode("utf-8", errors="replace")
return text, "utf-8(replace)", True
def read_markdown(path: str) -> Optional[ParsedNote]:
"""Öffentliche API zum Einlesen einer Datei."""
if not os.path.exists(path): return None
text, enc, had_fb = _read_text_with_fallback(path)
fm, body = _split_frontmatter(text)
return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)