from __future__ import annotations from dataclasses import dataclass import re import os import unicodedata import yaml from typing import Tuple, Dict FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) # YAML-Frontmatter am Anfang @dataclass class ParsedNote: frontmatter: Dict body: str path: str def _strip_bom(text: str) -> str: return text.lstrip("\ufeff") def _normalize_text(t: str) -> str: # Unicode-NFKC + vereinheitlichte Zeilenenden + Trim t = unicodedata.normalize("NFKC", t) t = t.replace("\r\n", "\n").replace("\r", "\n") return t def read_markdown(path: str) -> ParsedNote: with open(path, "r", encoding="utf-8") as f: raw = _strip_bom(f.read()) raw = _normalize_text(raw) m = FRONTMATTER_RE.match(raw) front, body = {}, raw if m: yaml_block = m.group(1) body = raw[m.end():] try: front = yaml.safe_load(yaml_block) or {} if not isinstance(front, dict): raise ValueError("Frontmatter must be a mapping") except yaml.YAMLError as e: raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e return ParsedNote(frontmatter=front, body=body, path=path) RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]") # [[id]] | [[id#anchor]] | [[id|label]] def extract_wikilinks(text: str) -> list[str]: return list({m.group(1).strip() for m in RE_WIKILINK.finditer(text)}) def validate_required_frontmatter(fm: dict, required=("title","id","type","status","created")): missing = [k for k in required if k not in fm or fm[k] in (None, "")] if missing: raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}") # einfache Plausibilitäten if not isinstance(fm.get("tags", []), (list, tuple)): if "tags" in fm and fm["tags"] not in (None, ""): raise ValueError("tags must be a list of strings") def normalize_frontmatter(fm: dict) -> dict: # kleinere Normalisierungen ohne die Semantik zu verändern out = dict(fm) if "tags" in out and isinstance(out["tags"], list): out["tags"] = [str(t).strip() for t in out["tags"]] if "embedding_exclude" in out: out["embedding_exclude"] = bool(out["embedding_exclude"]) return out