app/core/parser.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
039e5b80df
commit
b076cabae7
67
app/core/parser.py
Normal file
67
app/core/parser.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
import os
|
||||
import unicodedata
|
||||
import yaml
|
||||
from typing import Tuple, Dict
|
||||
|
||||
# Matches a YAML front-matter block at the very start of the text: an opening
# "---" line, the YAML payload (group 1; DOTALL lets it span several lines,
# the non-greedy quantifier stops at the first closing marker) and a closing
# "---" optionally followed by a newline.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) # YAML front matter at the beginning
|
||||
|
||||
@dataclass
class ParsedNote:
    """A Markdown note split into its front matter and its remaining text."""

    # Parsed front-matter mapping; empty when the note has none.
    frontmatter: Dict
    # Note text with the front-matter block (if any) removed.
    body: str
    # Path the note was read from.
    path: str
|
||||
|
||||
def _strip_bom(text: str) -> str:
    """Return *text* without any leading U+FEFF (byte-order mark) characters.

    Only the start of the string is affected; U+FEFF characters that appear
    later in the text are kept.
    """
    start = 0
    # Skip over every BOM character at the front, not just the first one.
    while start < len(text) and text[start] == "\ufeff":
        start += 1
    return text[start:]
|
||||
|
||||
def _normalize_text(t: str) -> str:
    """Return *t* in Unicode NFKC form with all line endings turned into "\\n".

    Leading and trailing whitespace is left untouched (nothing is trimmed).
    """
    composed = unicodedata.normalize("NFKC", t)
    # Collapse CRLF pairs first so they do not turn into two newlines, then
    # convert every remaining lone carriage return.
    without_crlf = composed.replace("\r\n", "\n")
    return "\n".join(without_crlf.split("\r"))
|
||||
|
||||
def read_markdown(path: str) -> ParsedNote:
    """Read a Markdown note and split it into front matter and body.

    The file is decoded as UTF-8, leading BOM characters are removed and the
    text is passed through ``_normalize_text``. If the text starts with a
    block matching ``FRONTMATTER_RE``, that block is parsed with
    ``yaml.safe_load`` and cut off from the body; otherwise the front matter
    is empty and the body is the whole text.

    Args:
        path: Location of the Markdown file.

    Returns:
        A ``ParsedNote`` holding the front-matter mapping, the body text and
        the given ``path``.

    Raises:
        ValueError: If the front matter is not valid YAML, or if it parses to
            anything other than a mapping (an empty block counts as ``{}``).

    Errors raised while opening or decoding the file are not caught here.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = _normalize_text(_strip_bom(f.read()))

    m = FRONTMATTER_RE.match(raw)
    if m is None:
        return ParsedNote(frontmatter={}, body=raw, path=path)

    # Keep the try body limited to the one call that can raise YAMLError.
    try:
        loaded = yaml.safe_load(m.group(1))
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e

    if loaded is None:
        # An empty (or comment-only) YAML block loads as None.
        loaded = {}
    elif not isinstance(loaded, dict):
        # Checked explicitly instead of via `or {}` so that falsy non-mappings
        # such as [] / 0 / false are rejected like any other non-mapping.
        raise ValueError(f"Frontmatter must be a mapping (in {path})")

    return ParsedNote(frontmatter=loaded, body=raw[m.end():], path=path)
|
||||
|
||||
RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]") # [[id]] | [[id#anchor]] | [[id|label]]


def extract_wikilinks(text: str) -> list[str]:
    """Return the distinct wikilink targets found in *text*.

    The target is the part before any ``#anchor`` or ``|label``, stripped of
    surrounding whitespace. Each target appears once, in order of first
    occurrence, so the result is deterministic across runs. Targets that are
    empty after stripping (e.g. ``[[ ]]``) are skipped.

    Args:
        text: Text to scan for ``[[...]]`` links.

    Returns:
        List of unique, non-empty link targets.
    """
    targets = (m.group(1).strip() for m in RE_WIKILINK.finditer(text))
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(t for t in targets if t))
|
||||
|
||||
def validate_required_frontmatter(fm: dict, required=("title", "id", "type", "status", "created")):
    """Check that *fm* carries all required fields and a plausible ``tags`` value.

    A required field counts as missing when its key is absent or its value is
    ``None`` or the empty string. ``tags`` may be absent, ``None``, ``""``, a
    list or a tuple; the elements themselves are not inspected.

    Args:
        fm: Front-matter mapping to validate.
        required: Names of the fields that must be present and non-empty.

    Returns:
        None when validation passes.

    Raises:
        ValueError: If required fields are missing, or ``tags`` has any other
            type than those listed above.
    """
    absent = []
    for field_name in required:
        if field_name not in fm or fm[field_name] in (None, ""):
            absent.append(field_name)
    if absent:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(absent)}")

    # Simple plausibility check for the optional tags field.
    tags = fm.get("tags")
    if isinstance(tags, (list, tuple)):
        return
    if tags in (None, ""):
        return
    raise ValueError("tags must be a list of strings")
|
||||
|
||||
def normalize_frontmatter(fm: dict) -> dict:
    """Return a shallow copy of *fm* with small value normalisations applied.

    * ``tags`` given as a list or tuple becomes a list of stripped strings
      (tuples are handled too, matching what the validator accepts).
    * ``embedding_exclude`` is coerced to ``bool``. String values are read
      by meaning rather than by truthiness: "", "0", "false", "no" and "off"
      (case-insensitive, surrounding whitespace ignored) become ``False``;
      any other string becomes ``True``.

    The input mapping is not modified; all other keys are copied unchanged.

    Args:
        fm: Front-matter mapping to normalise.

    Returns:
        The normalised copy.
    """
    out = dict(fm)

    tags = out.get("tags")
    if isinstance(tags, (list, tuple)):
        out["tags"] = [str(t).strip() for t in tags]

    if "embedding_exclude" in out:
        flag = out["embedding_exclude"]
        if isinstance(flag, str):
            # bool("false") is True; a quoted YAML scalar must not flip the flag on.
            flag = flag.strip().lower() not in ("", "0", "false", "no", "off")
        out["embedding_exclude"] = bool(flag)

    return out
|
||||
Loading…
Reference in New Issue
Block a user