60 lines
2.1 KiB
Python
60 lines
2.1 KiB
Python
"""
|
|
FILE: app/core/parsing/parsing_markdown.py
|
|
DESCRIPTION: Fehlertolerantes Einlesen von Markdown und Frontmatter-Splitting.
|
|
"""
|
|
import io
|
|
import os
|
|
import json
|
|
from typing import Any, Dict, Optional, Tuple
|
|
from .parsing_models import ParsedNote
|
|
from .parsing_utils import FRONTMATTER_RE
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
yaml = None
|
|
|
|
# Encodings attempted, in this order, by _read_text_with_fallback.
# latin-1 maps every possible byte value to a character, so it never raises
# UnicodeDecodeError and acts as the catch-all at the end of the chain.
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
|
|
|
|
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
    """Split a document into its frontmatter mapping and the remaining body.

    The first line must match ``FRONTMATTER_RE`` to open a frontmatter block;
    the next matching line among lines 1..1999 closes it. If either delimiter
    is missing, the text is returned unchanged together with an empty dict.

    Args:
        text: Full document text.

    Returns:
        A ``(frontmatter, body)`` tuple. ``frontmatter`` is the parsed YAML
        mapping, or ``{}`` when the YAML is empty, is not a mapping, or fails
        to parse (in the failure case a JSON warning is printed). ``body`` is
        the text after the closing delimiter with one leading newline removed.

    Raises:
        RuntimeError: If a complete frontmatter block is present but PyYAML
            could not be imported.
    """
    all_lines = text.splitlines(True)
    if not (all_lines and FRONTMATTER_RE.match(all_lines[0])):
        return {}, text

    # Only indices 1..1999 are searched for the closing delimiter, so a
    # document with an unterminated opener is not scanned to the end.
    closing = next(
        (
            idx
            for idx, candidate in enumerate(all_lines[1:2000], start=1)
            if FRONTMATTER_RE.match(candidate)
        ),
        None,
    )
    if closing is None:
        return {}, text

    if yaml is None:
        raise RuntimeError("PyYAML not installed.")

    raw_yaml = "".join(all_lines[1:closing])
    try:
        parsed = yaml.safe_load(raw_yaml)
    except Exception as exc:
        # Best effort: a broken header must not prevent the note from loading.
        print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(exc)}))
        parsed = None
    # Anything that is not a mapping (None, scalars, lists) counts as empty.
    metadata = parsed if isinstance(parsed, dict) else {}

    remainder = "".join(all_lines[closing + 1:])
    if remainder[:1] == "\n":
        remainder = remainder[1:]
    return metadata, remainder
|
|
|
|
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
    """Read a text file, trying each encoding in ``_FALLBACK_ENCODINGS`` in order.

    Args:
        path: Path of the file to read.

    Returns:
        A ``(text, encoding, used_fallback)`` tuple. ``encoding`` names the
        encoding that decoded the file, and ``used_fallback`` is True for
        anything other than plain ``"utf-8"``. A file that decodes as UTF-8
        but starts with a byte order mark is returned without the mark and
        reported as ``"utf-8-sig"``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    for enc in _FALLBACK_ENCODINGS:
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                text = f.read()
        except UnicodeDecodeError:
            continue
        if enc == "utf-8" and text.startswith("\ufeff"):
            # The plain utf-8 codec decodes a BOM as U+FEFF instead of
            # dropping it, and it succeeds before "utf-8-sig" gets a turn.
            # Strip the mark here so it does not end up in front of the
            # first line of the document.
            return text[1:], "utf-8-sig", True
        return text, enc, enc != "utf-8"

    # Last resort, reached only when every encoding in the chain failed:
    # decode as UTF-8 and substitute U+FFFD for undecodable bytes. Unlike the
    # text-mode reads above, this path does not translate line endings.
    with open(path, "rb") as fb:
        text = fb.read().decode("utf-8", errors="replace")
    return text, "utf-8(replace)", True
|
|
|
|
def read_markdown(path: str) -> Optional[ParsedNote]:
    """Read a Markdown file and split it into frontmatter and body.

    Args:
        path: Path of the Markdown file.

    Returns:
        A ``ParsedNote`` holding the frontmatter dict, the body text and the
        given path, or ``None`` when ``path`` is not an existing regular file.

    Raises:
        RuntimeError: Propagated from ``_split_frontmatter`` when the file
            has a frontmatter block but PyYAML is not installed.
    """
    # isfile() rather than exists(): a directory path exists as well, but
    # trying to read it would raise instead of yielding None.
    if not os.path.isfile(path):
        return None

    try:
        text, _enc, _had_fallback = _read_text_with_fallback(path)
    except FileNotFoundError:
        # The file disappeared between the check above and the read.
        return None

    fm, body = _split_frontmatter(text)
    return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)