""" FILE: app/core/parsing/parsing_markdown.py DESCRIPTION: Fehlertolerantes Einlesen von Markdown und Frontmatter-Splitting. """ import io import os import json from typing import Any, Dict, Optional, Tuple from .parsing_models import ParsedNote from .parsing_utils import FRONTMATTER_RE try: import yaml except ImportError: yaml = None _FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1") def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]: """Zerlegt Text in Frontmatter-Dict und Body.""" lines = text.splitlines(True) if not lines or not FRONTMATTER_RE.match(lines[0]): return {}, text end_idx = None for i in range(1, min(len(lines), 2000)): if FRONTMATTER_RE.match(lines[i]): end_idx = i break if end_idx is None: return {}, text fm_raw = "".join(lines[1:end_idx]) body = "".join(lines[end_idx + 1:]) if yaml is None: raise RuntimeError("PyYAML not installed.") try: loaded = yaml.safe_load(fm_raw) or {} data = loaded if isinstance(loaded, dict) else {} except Exception as e: print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)})) data = {} if body.startswith("\n"): body = body[1:] return data, body def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]: """Liest Datei mit Encoding-Fallback-Kette.""" last_err = None for enc in _FALLBACK_ENCODINGS: try: with io.open(path, "r", encoding=enc, errors="strict") as f: return f.read(), enc, (enc != "utf-8") except UnicodeDecodeError as e: last_err = str(e); continue with open(path, "rb") as fb: text = fb.read().decode("utf-8", errors="replace") return text, "utf-8(replace)", True def read_markdown(path: str) -> Optional[ParsedNote]: """Öffentliche API zum Einlesen einer Datei.""" if not os.path.exists(path): return None text, enc, had_fb = _read_text_with_fallback(path) fm, body = _split_frontmatter(text) return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)