app/core/parser.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-09-02 19:50:45 +02:00
parent 039e5b80df
commit b076cabae7

67
app/core/parser.py Normal file
View File

@ -0,0 +1,67 @@
from __future__ import annotations
from dataclasses import dataclass
import re
import os
import unicodedata
import yaml
from typing import Tuple, Dict
# Matches a YAML frontmatter block anchored at the very start of the text:
# an opening "---" line, the YAML payload (captured as group 1) and a closing
# "---". re.DOTALL lets the lazy ".*?" span several lines, so the payload ends
# at the first "\n---" that follows the opening line.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) # YAML frontmatter at the start
@dataclass
class ParsedNote:
    """A Markdown note split into its frontmatter and its body.

    Attributes:
        frontmatter: Mapping parsed from the YAML frontmatter block.
        body: Note text without the frontmatter block.
        path: Path of the file the note was read from.
    """

    frontmatter: Dict
    body: str
    path: str
def _strip_bom(text: str) -> str:
    """Return *text* without any leading U+FEFF byte-order-mark characters."""
    bom = "\ufeff"
    without_bom = text.lstrip(bom)
    return without_bom
def _normalize_text(t: str) -> str:
    """Return *t* in Unicode NFKC form with all line endings turned into "\\n".

    No leading or trailing whitespace is removed.
    """
    normalized = unicodedata.normalize("NFKC", t)
    # CRLF must be handled before lone CR, otherwise "\r\n" would become "\n\n".
    for line_ending in ("\r\n", "\r"):
        normalized = normalized.replace(line_ending, "\n")
    return normalized
def read_markdown(path: str) -> ParsedNote:
    """Read a UTF-8 Markdown file and split it into frontmatter and body.

    The file content has leading BOM characters stripped and is normalized
    before the frontmatter pattern is applied. When no frontmatter block is
    found, the frontmatter is an empty dict and the body is the whole text.

    Args:
        path: Location of the Markdown file.

    Returns:
        A ParsedNote holding the frontmatter mapping, the body and *path*.

    Raises:
        ValueError: If the frontmatter is not valid YAML, or if it parses to
            a non-empty value that is not a mapping.
    """
    with open(path, "r", encoding="utf-8") as handle:
        content = _normalize_text(_strip_bom(handle.read()))

    match = FRONTMATTER_RE.match(content)
    if match is None:
        return ParsedNote(frontmatter={}, body=content, path=path)

    try:
        loaded = yaml.safe_load(match.group(1))
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e

    # Any falsy result (e.g. an empty block parsed as None) counts as "no data".
    metadata = loaded or {}
    if not isinstance(metadata, dict):
        raise ValueError("Frontmatter must be a mapping")

    remainder = content[match.end():]
    return ParsedNote(frontmatter=metadata, body=remainder, path=path)
# Wikilink forms: [[id]] | [[id#anchor]] | [[id|label]]; group 1 captures the id.
RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]")


def extract_wikilinks(text: str) -> list[str]:
    """Return the unique wikilink targets found in *text*.

    Only the target id is returned; an optional "#anchor" or "|label" part is
    ignored and surrounding whitespace is stripped from the id.

    Args:
        text: Text to scan for wikilinks.

    Returns:
        The distinct targets in order of first appearance. Targets that are
        empty after stripping (e.g. "[[ ]]") are left out.
    """
    targets = (m.group(1).strip() for m in RE_WIKILINK.finditer(text))
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is stable across runs (a set would order strings by randomized hashes).
    return list(dict.fromkeys(target for target in targets if target))
def validate_required_frontmatter(fm: dict, required=("title", "id", "type", "status", "created")):
    """Check that *fm* has every required field and a plausible "tags" value.

    A field counts as missing when its key is absent or its value is None or
    the empty string. "tags" is optional; when present it must be a list or
    tuple, or be None / the empty string.

    Args:
        fm: Frontmatter mapping to validate.
        required: Keys that must be present with a non-empty value.

    Raises:
        ValueError: If required fields are missing or "tags" has another type.
    """
    empty_values = (None, "")

    missing = []
    for key in required:
        if key not in fm or fm[key] in empty_values:
            missing.append(key)
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")

    # Simple plausibility check for the optional "tags" field.
    if "tags" not in fm:
        return
    tags = fm["tags"]
    if isinstance(tags, (list, tuple)) or tags in empty_values:
        return
    raise ValueError("tags must be a list of strings")
def normalize_frontmatter(fm: dict) -> dict:
    """Return a shallow copy of *fm* with a few fields normalized.

    The input mapping is not modified.

    * "tags": a list or tuple becomes a list of stripped strings.
    * "embedding_exclude": converted to a bool. String values are interpreted
      by their text, so "false", "no", "off", "0" and blank strings (any
      case, surrounding whitespace ignored) yield False instead of the True
      that plain bool() would give for any non-empty string.

    Args:
        fm: Frontmatter mapping to normalize.

    Returns:
        The normalized copy.
    """
    false_words = ("", "false", "no", "off", "0")

    out = dict(fm)

    tags = out.get("tags")
    if isinstance(tags, (list, tuple)):
        out["tags"] = [str(t).strip() for t in tags]

    if "embedding_exclude" in out:
        flag = out["embedding_exclude"]
        if isinstance(flag, str):
            # A quoted YAML scalar arrives as str; bool("false") would be True.
            out["embedding_exclude"] = flag.strip().casefold() not in false_words
        else:
            out["embedding_exclude"] = bool(flag)
    return out