#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ tests/compare_vaults.py Vergleicht zwei Markdown-Vaults rekursiv: - Schlüsselauswahl: id | title | filename | auto (default: auto = id>title>filename) - Fokus: body | frontmatter | all (default: body) - Unicode: NFC-Normalisierung für IDs/Titel/Dateinamen und Text Beispiele: python3 tests/compare_vaults.py --src ./vault --dst ./_exportVault --focus body python3 tests/compare_vaults.py --src ./vault --dst ./_exportVault --key id --focus all Ausgabe: - Einträge mit 'missing' (nur in src oder nur in dst) - Einträge mit 'diff' (vorhanden in beiden, aber Abweichungen gem. Fokus) - Summary-Objekt am Ende """ from __future__ import annotations import os import sys import json import argparse import unicodedata from typing import Dict, Tuple, Optional, Any PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if PROJECT_ROOT not in sys.path: sys.path.insert(0, PROJECT_ROOT) # Wir nutzen denselben Parser wie im Projekt (fehlertolerant) from app.core.parser import read_markdown # erwartet: (frontmatter, body) MD_EXTS = {".md", ".markdown"} def nfc(s: Optional[str]) -> str: return unicodedata.normalize("NFC", s or "") def norm_text(txt: str) -> str: # normalize line endings + strip trailing spaces per line + NFC t = txt.replace("\r\n", "\n").replace("\r", "\n") t = "\n".join(line.rstrip() for line in t.split("\n")).strip() return nfc(t) def iter_md_files(root: str): for base, _dirs, files in os.walk(root): for fn in files: ext = os.path.splitext(fn)[1].lower() if ext in MD_EXTS: yield os.path.join(base, fn) def key_from(front: Dict[str, Any], path: str, pref: str) -> str: """ Liefert den Vergleichsschlüssel. pref in {"auto","id","title","filename"} """ if pref == "id": return nfc(str(front.get("id", "")).strip()) if pref == "title": return nfc(str(front.get("title", "")).strip()) if pref == "filename": name = os.path.splitext(os.path.basename(path))[0] return nfc(name.strip()) # auto: id > title > filename if "id" in front and str(front["id"]).strip(): return nfc(str(front["id"]).strip()) if "title" in front and str(front["title"]).strip(): return nfc(str(front["title"]).strip()) name = os.path.splitext(os.path.basename(path))[0] return nfc(name.strip()) def load_index(root: str, key_pref: str) -> Dict[str, Tuple[str, Dict[str, Any], str]]: """ Baut ein Index-Dict: key -> (path, frontmatter, body) """ idx: Dict[str, Tuple[str, Dict[str, Any], str]] = {} for p in iter_md_files(root): try: fm, body = read_markdown(p) k = key_from(fm or {}, p, key_pref) if not k: # Fallback: filename k = key_from({}, p, "filename") idx[k] = (p, fm or {}, body or "") except Exception as e: # Datei nicht parsebar -> trotzdem indexieren nach Dateiname (leer FM/Body) k = key_from({}, p, key_pref if key_pref != "auto" else "filename") idx[k] = (p, {}, "") return idx def compare_entries(src: Tuple[str, Dict[str, Any], str], dst: Tuple[str, Dict[str, Any], str], focus: str) -> Dict[str, Any]: sp, sfm, sbody = src dp, dfm, dbody = dst res = {"status": "ok"} # wird 'diff' falls Unterschiede diffs = {} if focus in ("frontmatter", "all"): # Vergleiche FM minimal: id und title s_id = nfc(str(sfm.get("id", "")).strip()) d_id = nfc(str(dfm.get("id", "")).strip()) s_title = nfc(str(sfm.get("title", "")).strip()) d_title = nfc(str(dfm.get("title", "")).strip()) if s_id != d_id: diffs["frontmatter.id"] = {"src": s_id, "dst": d_id} if s_title != d_title: diffs["frontmatter.title"] = {"src": s_title, "dst": d_title} if focus in ("body", "all"): if norm_text(sbody) != norm_text(dbody): diffs["body"] = {"src_len": len(sbody or ""), "dst_len": len(dbody or "")} if diffs: res["status"] = "diff" res["diffs"] = diffs res["src_path"] = sp res["dst_path"] = dp return res def main(): ap = argparse.ArgumentParser() ap.add_argument("--src", required=True, help="Quell-Vault-Ordner") ap.add_argument("--dst", required=True, help="Export-Vault-Ordner") ap.add_argument("--focus", choices=["body", "frontmatter", "all"], default="body") ap.add_argument("--key", choices=["auto", "id", "title", "filename"], default="auto", help="Vergleichsschlüssel (default: auto=id>title>filename)") args = ap.parse_args() src_idx = load_index(args.src, args.key) dst_idx = load_index(args.dst, args.key) src_keys = set(src_idx.keys()) dst_keys = set(dst_idx.keys()) # Fehlende only_src = sorted(src_keys - dst_keys) only_dst = sorted(dst_keys - src_keys) count = 0 for k in only_src: print(json.dumps({"note_id": k, "status": "missing", "src": True, "dst": False}, ensure_ascii=False)) count += 1 for k in only_dst: print(json.dumps({"note_id": k, "status": "missing", "src": False, "dst": True}, ensure_ascii=False)) count += 1 # Vergleiche, wo vorhanden diff_count = 0 common = sorted(src_keys & dst_keys) for k in common: res = compare_entries(src_idx[k], dst_idx[k], args.focus) if res["status"] == "diff": res["note_id"] = k print(json.dumps(res, ensure_ascii=False)) diff_count += 1 summary = { "summary": "OK" if (count == 0 and diff_count == 0) else "DIFFS", "missing_count": count, "diff_count": diff_count, "total_src": len(src_keys), "total_dst": len(dst_keys), "focus": args.focus, "key": args.key, } print(json.dumps(summary, ensure_ascii=False)) # Exit-Code für CI if summary["summary"] != "OK": # non-zero exit optional; hier 0 lassen, damit CLI-Ausgaben sichtbar bleiben pass if __name__ == "__main__": main()