# Markdown note parsing helpers (app/core/parser.py).
#
# Reads UTF-8 Markdown notes, splits off an optional YAML frontmatter block,
# extracts [[wikilink]] targets and validates / normalizes frontmatter mappings.
from __future__ import annotations

from dataclasses import dataclass
import os
import re
import unicodedata
from typing import Dict, Tuple

import yaml

# YAML frontmatter at the very start of the text: an opening `---` line, the
# YAML payload (group 1) and a closing `---`.
# NOTE: an empty block ("---" directly followed by "---") does not match,
# because the pattern needs a line break on both sides of the payload; such a
# note is treated as having no frontmatter at all.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL)

# [[id]] | [[id#anchor]] | [[id|label]] -- group 1 is the link target (id).
RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]")

# String spellings of `embedding_exclude` that mean "do not exclude".
_FALSE_STRINGS = frozenset({"", "0", "false", "no", "off"})


@dataclass
class ParsedNote:
    """A Markdown note split into frontmatter, body and source path."""

    # Parsed YAML frontmatter; an empty dict when no frontmatter was found.
    frontmatter: Dict
    # Text after the frontmatter block, or the whole text when there is none.
    body: str
    # The path exactly as it was passed to read_markdown().
    path: str


def _strip_bom(text: str) -> str:
    """Return *text* without any leading U+FEFF byte-order marks."""
    return text.lstrip("\ufeff")


def _normalize_text(t: str) -> str:
    """Return *t* in Unicode NFKC form with all line endings as ``\\n``.

    No whitespace is trimmed; only the Unicode form and the line endings
    change.
    """
    t = unicodedata.normalize("NFKC", t)
    # Replace CRLF first so that a lone-CR pass cannot double the line breaks.
    return t.replace("\r\n", "\n").replace("\r", "\n")


def read_markdown(path: str) -> ParsedNote:
    """Read a UTF-8 Markdown file and split it into frontmatter and body.

    The text is stripped of a leading BOM and normalized (NFKC, ``\\n`` line
    endings) before the frontmatter is looked for, so both the YAML block and
    the body are returned in normalized form.

    Args:
        path: File system path of the note.

    Returns:
        A ParsedNote. ``frontmatter`` is ``{}`` and ``body`` is the whole
        text when the file does not start with a frontmatter block.

    Raises:
        ValueError: If the frontmatter is not valid YAML or does not parse
            to a mapping.

    Errors from opening or decoding the file propagate unchanged.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = _normalize_text(_strip_bom(f.read()))

    m = FRONTMATTER_RE.match(raw)
    if m is None:
        return ParsedNote(frontmatter={}, body=raw, path=path)

    try:
        # An empty or otherwise falsy YAML document counts as "no fields".
        front = yaml.safe_load(m.group(1)) or {}
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e
    if not isinstance(front, dict):
        raise ValueError(f"Frontmatter must be a mapping in {path}")

    return ParsedNote(frontmatter=front, body=raw[m.end():], path=path)


def extract_wikilinks(text: str) -> list[str]:
    """Return the distinct wikilink targets found in *text*.

    Only the target is returned: ``[[id#anchor]]`` and ``[[id|label]]`` both
    yield ``"id"``, with surrounding whitespace removed. Targets are listed
    in order of first appearance so the result is stable between runs, and
    targets that are blank after stripping are skipped.
    """
    targets = (m.group(1).strip() for m in RE_WIKILINK.finditer(text))
    # dict keys keep insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(target for target in targets if target))


def validate_required_frontmatter(
    fm: dict,
    required: Tuple[str, ...] = ("title", "id", "type", "status", "created"),
) -> None:
    """Check that *fm* has the required fields and a plausible ``tags`` value.

    Args:
        fm: Frontmatter mapping to check.
        required: Keys that must be present with a value other than ``None``
            or the empty string.

    Raises:
        ValueError: If required fields are missing or empty, or if ``tags``
            is set to something other than a list or tuple.
    """
    missing = [k for k in required if k not in fm or fm[k] in (None, "")]
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")

    # Simple plausibility check: an absent, None or empty-string `tags` is
    # accepted; any other non-sequence value is rejected. Elements are not
    # inspected here.
    tags = fm.get("tags")
    if not isinstance(tags, (list, tuple)) and tags not in (None, ""):
        raise ValueError("tags must be a list of strings")


def normalize_frontmatter(fm: dict) -> dict:
    """Return a shallow copy of *fm* with small, meaning-preserving cleanups.

    * ``tags`` given as a list or tuple becomes a list of stripped strings.
    * ``embedding_exclude`` becomes a bool. String values are read textually
      ("false", "no", "off", "0" and "" mean False, case-insensitively), so
      a quoted "false" is not mistaken for True; other values use ``bool()``.

    The input mapping itself is not modified.
    """
    out = dict(fm)

    tags = out.get("tags")
    if isinstance(tags, (list, tuple)):
        out["tags"] = [str(tag).strip() for tag in tags]

    if "embedding_exclude" in out:
        flag = out["embedding_exclude"]
        if isinstance(flag, str):
            out["embedding_exclude"] = flag.strip().lower() not in _FALSE_STRINGS
        else:
            out["embedding_exclude"] = bool(flag)

    return out