From 300086fc83a80f2550a576e01e3b7b4b5f811dde Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 17 Nov 2025 10:36:14 +0100 Subject: [PATCH] Dateien nach "scripts" hochladen --- scripts/import_markdown.py | 522 +++++++++++++++++++++++-------------- 1 file changed, 325 insertions(+), 197 deletions(-) diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index 6de95a3..ac04459 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -1,18 +1,48 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -scripts/import_markdown.py (Mindnet V2 — Importer, v2.5.2) +scripts/import_markdown.py -Änderungen in 2.5.2 (minimal & gezielt) ---------------------------------------- -- **Explizite Spiegelung** von `chunk_profile` in Note- und Chunk-Payload: - note_pl["chunk_profile"] = fm.get("chunk_profile"); für jeden Chunk cpl["chunk_profile"] = fm.get("chunk_profile"). -- **Explizite Spiegelung** von `retriever_weight` in jedem Chunk (falls Builder es nicht gesetzt hat). -- **Feld-basierte Change-Erkennung** erweitert: - Wenn bestehende Note-Payload (`old_payload`) bei `retriever_weight` oder `chunk_profile` vom neuen Wert abweicht - oder ein Feld fehlt, wird `changed = True` gesetzt → Upsert erzwingen. -- **Robuste Übergabe** von Type-Registry an Chunk-Builder: - `types_cfg=(reg.get('types') or reg or {})` – damit greifen Resolver in chunk_payload.py sicher auf `types.yaml` zu. +Zweck +----- +- Liest Markdown-Notizen aus einem Vault ein +- Erzeugt Note-Payload, Chunk-Payloads (+ optionale Embeddings) und Edges +- Schreibt alles idempotent in Qdrant (Notes, Chunks, Edges) +- Integriert eine optionale Type-Registry (types.yaml), um z. B. chunk_profile + und retriever_weight pro Notiz-Typ zu steuern. + +Wesentliche Fixes ggü. vorherigen fehlerhaften Ständen +------------------------------------------------------ +- `embed_texts` wird optional importiert und defensiv geprüft (kein NameError mehr) +- `effective_chunk_profile` / `effective_retriever_weight` und Registry-Helfer + sind VOR `main()` definiert (kein NameError mehr) +- `retriever_weight` wird in Note- und Chunk-Payload zuverlässig gesetzt +- Robuste Kantenbildung; Fehler bei Edges blockieren Notes/Chunks nicht +- Korrekte Verwendung von `scroll_filter` beim Qdrant-Client +- `--purge-before-upsert` entfernt alte Chunks/Edges einer Note vor dem Upsert + +Qdrant / ENV +------------ +- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY +- COLLECTION_PREFIX (Default: mindnet), via --prefix überschreibbar +- VECTOR_DIM (Default: 384) +- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false) +- MINDNET_TYPES_FILE: Pfad zu types.yaml (optional; Default: ./types.yaml) + +Beispiele +--------- + # Standard (Body, parsed, canonical) + python3 -m scripts.import_markdown --vault ./vault + + # Erstimport nach truncate (Create-Fall) + python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert + + # Nur eine Datei (Diagnose) + python3 -m scripts.import_markdown --vault ./vault --only-path ./vault/30_projects/project-demo.md --apply + + # Sync-Deletes (Dry-Run → Apply) + python3 -m scripts.import_markdown --vault ./vault --sync-deletes + python3 -m scripts.import_markdown --vault ./vault --sync-deletes --apply """ from __future__ import annotations @@ -24,109 +54,151 @@ from typing import Dict, List, Optional, Tuple, Any, Set from dotenv import load_dotenv from qdrant_client.http import models as rest -import yaml # --- Projekt-Imports --- -from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter +from app.core.parser import ( + read_markdown, + normalize_frontmatter, + validate_required_frontmatter, +) from app.core.note_payload import make_note_payload from app.core.chunker import assemble_chunks from app.core.chunk_payload import make_chunk_payloads try: from app.core.derive_edges import build_edges_for_note -except Exception: +except Exception: # pragma: no cover from app.core.edges import build_edges_for_note # type: ignore -from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes -from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch +from app.core.qdrant import ( + QdrantConfig, + get_client, + ensure_collections, + ensure_payload_indexes, +) +from app.core.qdrant_points import ( + points_for_chunks, + points_for_note, + points_for_edges, + upsert_batch, +) -# embeddings optional +# embeddings sind optional (z. B. im reinen Payload-Backfill) try: from app.core.embed import embed_texts # optional -except Exception: - embed_texts = None # type: ignore +except Exception: # pragma: no cover + embed_texts = None -# ============================================================ -# Type-Registry -# ============================================================ +# --------------------------------------------------------------------- +# Type-Registry (types.yaml) – Helper (robust, optional) +# --------------------------------------------------------------------- -def _env(name: str, default: Optional[str] = None) -> str: +def _env(name: str, default: Optional[str] = None) -> Optional[str]: v = os.getenv(name) - return v if v is not None else (default or "") + return v if v is not None else default -def _deep_get(root: Any, path: str) -> Any: - cur = root - for key in path.split("."): - if not isinstance(cur, dict) or key not in cur: - return None - cur = cur[key] - return cur - -def _as_float(x: Any) -> Optional[float]: +def _load_json_or_yaml(path: str) -> dict: + import io + data: dict = {} + if not path or not os.path.exists(path): + return data try: - return float(x) + import yaml # type: ignore + with io.open(path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + if not isinstance(data, dict): + return {} + return data except Exception: - return None + # YAML evtl. nicht installiert – versuche JSON + try: + with io.open(path, "r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict): + return {} + return data + except Exception: + return {} def load_type_registry() -> dict: - path = _env("MINDNET_TYPES_FILE", "./config/types.yaml") - if not os.path.isfile(path): - return {} - try: - with open(path, "r", encoding="utf-8") as f: - return yaml.safe_load(f) or {} - except Exception: - return {} + # Reihenfolge: ENV > ./types.yaml (im aktuellen Arbeitsverzeichnis) + p = _env("MINDNET_TYPES_FILE", None) + if p and os.path.exists(p): + return _load_json_or_yaml(p) + fallback = os.path.abspath("./config/types.yaml") if os.path.exists("./config/types.yaml") else os.path.abspath("./types.yaml") + return _load_json_or_yaml(fallback) def get_type_config(note_type: Optional[str], reg: dict) -> dict: - if not note_type or not isinstance(reg, dict): + if not reg or not isinstance(reg, dict): return {} types = reg.get("types", {}) if isinstance(reg.get("types"), dict) else {} - return types.get(note_type, {}) if isinstance(types, dict) else {} + if note_type and isinstance(note_type, str) and note_type in types: + return types[note_type] or {} + # Fallback: concept + return types.get("concept", {}) or {} def resolve_note_type(requested: Optional[str], reg: dict) -> str: if requested and isinstance(requested, str): return requested + # Fallback wenn nichts gesetzt ist types = reg.get("types", {}) if isinstance(reg.get("types"), dict) else {} return "concept" if "concept" in types else (requested or "concept") def effective_chunk_profile(note_type: str, reg: dict) -> Optional[str]: + """Resolve chunk_profile for type or from defaults/global. + Accepts symbolic profiles: short|medium|long|default. + """ cfg = get_type_config(note_type, reg) - prof = cfg.get("chunk_profile") - if isinstance(prof, str): + prof = (cfg.get("chunk_profile") if isinstance(cfg, dict) else None) + if isinstance(prof, str) and prof: return prof - return None + # defaults fallbacks + for key in ("defaults", "default", "global"): + dcfg = reg.get(key) if isinstance(reg, dict) else None + if isinstance(dcfg, dict): + dprof = dcfg.get("chunk_profile") + if isinstance(dprof, str) and dprof: + return dprof + return "default" -def effective_retriever_weight_from_registry(note_type: str, reg: dict) -> Tuple[Optional[float], Optional[str]]: - candidates = [ - f"types.{note_type}.retriever_weight", - f"types.{note_type}.retriever.weight", - f"types.{note_type}.retrieval.weight", - "defaults.retriever_weight", - "defaults.retriever.weight", - "global.retriever_weight", - "global.retriever.weight", - ] - for path in candidates: - val = _deep_get(reg, path) - v = _as_float(val) - if v is not None: - return v, path - return None, None - -def compute_effective_retriever_weight(fm: Dict[str, Any], note_type: str, reg: dict) -> Tuple[float, str]: - if fm.get("retriever_weight") is not None: - v = _as_float(fm.get("retriever_weight")) - if v is not None: - return v, "frontmatter.retriever_weight" - r, rpath = effective_retriever_weight_from_registry(note_type, reg) - if r is not None: - return float(r), f"types.yaml:{rpath}" - return 1.0, "default:1.0" +def effective_retriever_weight(note_type: str, reg: dict) -> Optional[float]: + """Resolve retriever_weight for type or defaults; returns float. + """ + cfg = get_type_config(note_type, reg) + w = (cfg.get("retriever_weight") if isinstance(cfg, dict) else None) + try: + if w is not None: + return float(w) + except Exception: + pass + # defaults fallbacks + for key in ("defaults", "default", "global"): + dcfg = reg.get(key) if isinstance(reg, dict) else None + if isinstance(dcfg, dict): + dw = dcfg.get("retriever_weight") + try: + if dw is not None: + return float(dw) + except Exception: + pass + return 1.0 -# ============================================================ -# Qdrant Helpers -# ============================================================ +# --------------------------------------------------------------------- +# Sonstige Helper +# --------------------------------------------------------------------- + +def iter_md(root: str) -> List[str]: + out: List[str] = [] + for dirpath, _, filenames in os.walk(root): + for fn in filenames: + if not fn.lower().endswith(".md"): + continue + p = os.path.join(dirpath, fn) + pn = p.replace("\\", "/") + if any(ex in pn for ex in ["/.obsidian/", "/_backup_frontmatter/", "/_imported/"]): + continue + out.append(p) + return sorted(out) def collections(prefix: str) -> Tuple[str, str, str]: return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" @@ -134,7 +206,13 @@ def collections(prefix: str) -> Tuple[str, str, str]: def fetch_existing_note_payload(client, prefix: str, note_id: str) -> Optional[Dict]: notes_col, _, _ = collections(prefix) f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - points, _ = client.scroll(collection_name=notes_col, scroll_filter=f, with_payload=True, with_vectors=False, limit=1) + points, _ = client.scroll( + collection_name=notes_col, + scroll_filter=f, # wichtig: scroll_filter (nicht: filter) + with_payload=True, + with_vectors=False, + limit=1, + ) if not points: return None return points[0].payload or {} @@ -144,7 +222,13 @@ def list_qdrant_note_ids(client, prefix: str) -> Set[str]: out: Set[str] = set() next_page = None while True: - pts, next_page = client.scroll(collection_name=notes_col, with_payload=True, with_vectors=False, limit=256, offset=next_page) + pts, next_page = client.scroll( + collection_name=notes_col, + with_payload=True, + with_vectors=False, + limit=256, + offset=next_page, + ) if not pts: break for p in pts: @@ -161,7 +245,11 @@ def purge_note_artifacts(client, prefix: str, note_id: str) -> None: filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) for col in (chunks_col, edges_col): try: - client.delete(collection_name=col, points_selector=rest.FilterSelector(filter=filt), wait=True) + client.delete( + collection_name=col, + points_selector=rest.FilterSelector(filter=filt), + wait=True + ) except Exception as e: print(json.dumps({"note_id": note_id, "warn": f"delete in {col} via filter failed: {e}"})) @@ -170,25 +258,40 @@ def delete_note_everywhere(client, prefix: str, note_id: str) -> None: filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) for col in (edges_col, chunks_col, notes_col): try: - client.delete(collection_name=col, points_selector=rest.FilterSelector(filter=filt), wait=True) + client.delete( + collection_name=col, + points_selector=rest.FilterSelector(filter=filt), + wait=True + ) except Exception as e: print(json.dumps({"note_id": note_id, "warn": f"delete in {col} failed: {e}"})) + +# --- Neu: Existenz-Checks für Artefakte (fehlertoleranter Rebuild) --- + def _has_any_point(client, collection: str, note_id: str) -> bool: + """Prüft, ob es mind. einen Punkt mit note_id in der Collection gibt.""" filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - pts, _ = client.scroll(collection_name=collection, scroll_filter=filt, with_payload=False, with_vectors=False, limit=1) + pts, _ = client.scroll( + collection_name=collection, + scroll_filter=filt, + with_payload=False, + with_vectors=False, + limit=1, + ) return bool(pts) def artifacts_missing(client, prefix: str, note_id: str) -> Tuple[bool, bool]: + """Gibt (chunks_missing, edges_missing) zurück.""" _, chunks_col, edges_col = collections(prefix) chunks_missing = not _has_any_point(client, chunks_col, note_id) edges_missing = not _has_any_point(client, edges_col, note_id) return chunks_missing, edges_missing -# ============================================================ +# --------------------------------------------------------------------- # Main -# ============================================================ +# --------------------------------------------------------------------- def _resolve_mode(m: Optional[str]) -> str: m = (m or "body").strip().lower() @@ -197,30 +300,34 @@ def _resolve_mode(m: Optional[str]) -> str: def main() -> None: load_dotenv() - ap = argparse.ArgumentParser(prog="scripts.import_markdown", description="Importiert Markdown-Notizen in Qdrant (Notes/Chunks/Edges).") - ap.add_argument("--vault", required=True) - ap.add_argument("--only-path") - ap.add_argument("--apply", action="store_true") - ap.add_argument("--purge-before-upsert", action="store_true") - ap.add_argument("--force-replace", action="store_true") - ap.add_argument("--note-id") - ap.add_argument("--note-scope-refs", action="store_true") - ap.add_argument("--hash-mode") - ap.add_argument("--hash-source") - ap.add_argument("--hash-normalize") - ap.add_argument("--compare-text", action="store_true") - ap.add_argument("--baseline-modes", action="store_true") - ap.add_argument("--sync-deletes", action="store_true") - ap.add_argument("--prefix") + ap = argparse.ArgumentParser( + prog="scripts.import_markdown", + description="Importiert Markdown-Notizen in Qdrant (Notes/Chunks/Edges)." + ) + ap.add_argument("--vault", required=True, help="Pfad zum Vault (Ordner mit .md-Dateien)") + ap.add_argument("--only-path", help="Nur diese Datei verarbeiten (absolut oder relativ)") + ap.add_argument("--apply", action="store_true", help="Schreibt nach Qdrant (sonst Dry-Run)") + ap.add_argument("--purge-before-upsert", action="store_true", help="Alte Chunks/Edges der Note vorher löschen") + ap.add_argument("--force-replace", action="store_true", help="Note/Chunks/Edges unabhängig von Hash neu schreiben") + ap.add_argument("--note-id", help="Nur Notes mit dieser ID verarbeiten (Filter)") + ap.add_argument("--note-scope-refs", action="store_true", help="Note-scope References/Backlinks erzeugen") + ap.add_argument("--hash-mode", help="body|frontmatter|full (Default body)") + ap.add_argument("--hash-source", help="parsed|raw (Default parsed)") + ap.add_argument("--hash-normalize", help="canonical|none (Default canonical)") + ap.add_argument("--compare-text", action="store_true", help="Parsed fulltext zusätzlich direkt vergleichen") + ap.add_argument("--baseline-modes", action="store_true", help="Fehlende Hash-Varianten still nachtragen (Notes)") + ap.add_argument("--sync-deletes", action="store_true", help="Qdrant->Vault Lösch-Sync (Dry-Run; mit --apply ausführen)") + ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)") args = ap.parse_args() - mode = _resolve_mode(args.hash_mode) - src = _env("MINDNET_HASH_SOURCE", args.hash_source or "parsed") - norm = _env("MINDNET_HASH_NORMALIZE", args.hash_normalize or "canonical") + mode = _resolve_mode(args.hash_mode) # body|frontmatter|full + src = _env("MINDNET_HASH_SOURCE", args.hash_source or "parsed") # parsed|raw + norm = _env("MINDNET_HASH_NORMALIZE", args.hash_normalize or "canonical") # canonical|none note_scope_refs_env = (_env("MINDNET_NOTE_SCOPE_REFS", "false") == "true") note_scope_refs = args.note_scope_refs or note_scope_refs_env compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true") + # Qdrant cfg = QdrantConfig.from_env() if args.prefix: cfg.prefix = args.prefix.strip() @@ -228,30 +335,22 @@ def main() -> None: ensure_collections(client, cfg.prefix, cfg.dim) ensure_payload_indexes(client, cfg.prefix) + # Type-Registry laden (optional) reg = load_type_registry() - types_cfg_node = reg.get("types") if isinstance(reg, dict) else {} root = os.path.abspath(args.vault) - # Datei-Liste + # Dateiliste if args.only_path: - files = [os.path.abspath(args.only_path)] + only = os.path.abspath(args.only_path) + files = [only] else: - files: List[str] = [] - for dirpath, _, filenames in os.walk(root): - for fn in filenames: - if fn.lower().endswith(".md"): - p = os.path.join(dirpath, fn) - pn = p.replace("\\", "/") - if any(ex in pn for ex in ["/.obsidian/", "/_backup_frontmatter/", "/_imported/"]): - continue - files.append(p) - files.sort() + files = iter_md(root) if not files: print("Keine Markdown-Dateien gefunden.", file=sys.stderr) sys.exit(2) - # Optional: Sync-Deletes + # Optional: Sync-Deletes vorab if args.sync_deletes: vault_note_ids: Set[str] = set() for path in files: @@ -267,15 +366,22 @@ def main() -> None: continue qdrant_note_ids = list_qdrant_note_ids(client, cfg.prefix) to_delete = sorted(qdrant_note_ids - vault_note_ids) - print(json.dumps({"action":"sync-deletes","prefix":cfg.prefix,"qdrant_total":len(qdrant_note_ids),"vault_total":len(vault_note_ids),"to_delete_count":len(to_delete),"to_delete":to_delete[:50]+(["…"] if len(to_delete)>50 else [])}, ensure_ascii=False)) + print(json.dumps({ + "action": "sync-deletes", + "prefix": cfg.prefix, + "qdrant_total": len(qdrant_note_ids), + "vault_total": len(vault_note_ids), + "to_delete_count": len(to_delete), + "to_delete": to_delete[:50] + (["…"] if len(to_delete) > 50 else []) + }, ensure_ascii=False)) if args.apply and to_delete: for nid in to_delete: - print(json.dumps({"action":"delete","note_id":nid,"decision":"apply"})) + print(json.dumps({"action": "delete", "note_id": nid, "decision": "apply"})) delete_note_everywhere(client, cfg.prefix, nid) key_current = f"{mode}:{src}:{norm}" - processed = 0 + processed = 0 for path in files: try: parsed = read_markdown(path) @@ -285,7 +391,7 @@ def main() -> None: print(json.dumps({"path": path, "error": f"read_markdown failed: {type(e).__name__}: {e}"})) continue - # Frontmatter + # --- Frontmatter prüfen --- try: fm = normalize_frontmatter(parsed.frontmatter) validate_required_frontmatter(fm) @@ -298,99 +404,94 @@ def main() -> None: processed += 1 - # Typ + Profile/Weight - note_type = resolve_note_type(fm.get("type"), reg) + # --- Type-Registry anwenden (chunk_profile / retriever_weight) --- + try: + note_type = resolve_note_type(fm.get("type"), reg) + except Exception: + note_type = (fm.get("type") or "concept") fm["type"] = note_type or fm.get("type") or "concept" + prof = effective_chunk_profile(note_type, reg) if prof: fm["chunk_profile"] = prof - rw, rw_source = compute_effective_retriever_weight(fm, note_type, reg) - fm["retriever_weight"] = rw - # Note-Payload + weight = effective_retriever_weight(note_type, reg) + if weight is not None: + try: + fm["retriever_weight"] = float(weight) + except Exception: + pass # falls FM string-inkonsistent ist + + # --- Payload aufbauen (inkl. Hashes) --- try: - note_pl = make_note_payload(parsed, vault_root=root, hash_mode=mode, hash_normalize=norm, hash_source=src, file_path=path) + note_pl = make_note_payload( + parsed, + vault_root=root, + hash_mode=mode, + hash_normalize=norm, + hash_source=src, + file_path=path, + ) except Exception as e: print(json.dumps({"path": path, "error": f"make_note_payload failed: {type(e).__name__}: {e}"})) continue - # Explizites Spiegeln: chunk_profile & retriever_weight in Note - if fm.get("chunk_profile") is not None: - note_pl["chunk_profile"] = fm.get("chunk_profile") - note_pl["retriever_weight"] = float(rw) if isinstance(rw, (int, float)) else 1.0 + if not note_pl.get("fulltext"): + note_pl["fulltext"] = getattr(parsed, "body", "") or "" + + # retriever_weight sicher in Note-Payload spiegeln (für spätere Filter) + if "retriever_weight" not in note_pl and fm.get("retriever_weight") is not None: + try: + note_pl["retriever_weight"] = float(fm.get("retriever_weight")) + except Exception: + pass note_id = note_pl.get("note_id") or fm.get("id") if not note_id: print(json.dumps({"path": path, "error": "Missing note_id after payload build"})) continue - # Bestehende Note in Qdrant (für Changed-Detektion) - old_payload = fetch_existing_note_payload(client, cfg.prefix, note_id) + # --- bestehenden Payload laden (zum Diff) --- + old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id) has_old = old_payload is not None + old_hashes = (old_payload or {}).get("hashes") or {} old_hash_exact = old_hashes.get(key_current) new_hash_exact = (note_pl.get("hashes") or {}).get(key_current) needs_baseline = (old_hash_exact is None) + hash_changed = (old_hash_exact is not None and new_hash_exact is not None and old_hash_exact != new_hash_exact) text_changed = False - # Optionaler Text-Vergleich (teuer) - # if compare_text: - # old_text = (old_payload or {}).get("fulltext") or "" - # new_text = note_pl.get("fulltext") or (getattr(parsed, "body", "") or "") - # text_changed = (old_text != new_text) + if compare_text: + old_text = (old_payload or {}).get("fulltext") or "" + new_text = note_pl.get("fulltext") or "" + text_changed = (old_text != new_text) changed = args.force_replace or (not has_old) or hash_changed or text_changed + do_baseline_only = (args.baseline_modes and has_old and needs_baseline and not changed) - # Feld-basierte Change-Erkennung (erzwingt Update, wenn Werte nicht stimmen) - if has_old: - if old_payload.get("retriever_weight") != note_pl.get("retriever_weight"): - changed = True - if old_payload.get("chunk_profile") != note_pl.get("chunk_profile"): - changed = True - - # Chunks + # --- Chunks + Embeddings vorbereiten --- try: body_text = getattr(parsed, "body", "") or "" chunks = assemble_chunks(fm["id"], body_text, fm.get("type", "concept")) - # WICHTIG: Note-Objekt mit verschachtelter FM + Registry an Chunk-Builder übergeben - chunk_note = { - "frontmatter": fm, - "id": fm.get("id"), - "type": fm.get("type"), - "title": fm.get("title"), - "path": note_pl.get("path") or path, - "note_id": note_pl.get("note_id"), - "tags": fm.get("tags"), - } - chunk_pls: List[Dict[str, Any]] = make_chunk_payloads( - chunk_note, - note_pl["path"], - chunks, - note_text=body_text, - types_cfg=(reg.get("types") if isinstance(reg, dict) and isinstance(reg.get("types"), dict) else reg if isinstance(reg, dict) else {}), - file_path=path, - ) + chunk_pls: List[Dict[str, Any]] = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) except Exception as e: print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {type(e).__name__}: {e}"})) continue - # Explizites Spiegeln: retriever_weight & chunk_profile auf JEDEN Chunk - rwf = float(rw) if isinstance(rw, (int, float)) else 1.0 - cpv = fm.get("chunk_profile") - for i, pl in enumerate(chunk_pls): - if "index" not in pl: - pl["index"] = i - pl["ord"] = int(pl.get("index", i)) + 1 - if "retriever_weight" not in pl: - pl["retriever_weight"] = rwf - if cpv is not None: - pl["chunk_profile"] = cpv - # entferne alte Aliasse - for alias in ("chunk_num", "Chunk_Number"): - pl.pop(alias, None) + # retriever_weight auf Chunk-Payload spiegeln + if fm.get("retriever_weight") is not None: + try: + rw = float(fm.get("retriever_weight")) + for pl in chunk_pls: + # Feld nur setzen, wenn noch nicht vorhanden + if "retriever_weight" not in pl: + pl["retriever_weight"] = rw + except Exception: + pass - # Embeddings (optional) + # Embeddings (fallback: Nullvektoren) vecs: List[List[float]] = [[0.0] * int(cfg.dim) for _ in chunk_pls] if embed_texts and chunk_pls: try: @@ -399,61 +500,88 @@ def main() -> None: except Exception as e: print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"})) - # Artefakte vorhanden? + # --- Fehlende Artefakte in Qdrant ermitteln --- chunks_missing, edges_missing = artifacts_missing(client, cfg.prefix, note_id) - # Edges + # --- Edges (robust) --- edges: List[Dict[str, Any]] = [] edges_failed = False - should_build_edges = (changed and True) or edges_missing # wenn Note geändert oder Kanten fehlen + should_build_edges = (changed and (not do_baseline_only)) or edges_missing if should_build_edges: try: - note_refs = note_pl.get("references") or "" - edges = build_edges_for_note(note_id, chunk_pls, note_refs, include_note_scope_refs=note_scope_refs) + note_refs = note_pl.get("references") or [] + edges = build_edges_for_note( + note_id, + chunk_pls, + note_level_references=note_refs, + include_note_scope_refs=note_scope_refs, + ) except Exception as e: edges_failed = True edges = [] print(json.dumps({"path": path, "note_id": note_id, "warn": f"build_edges_for_note failed, skipping edges: {type(e).__name__}: {e}"})) - # Summary - print(json.dumps({ + # --- Summary (stdout) --- + summary = { "note_id": note_id, "title": fm.get("title"), - "type": fm.get("type"), - "rw": rw, - "chunk_profile": fm.get("chunk_profile"), "chunks": len(chunk_pls), "edges": len(edges), + "edges_failed": edges_failed, "changed": changed, "chunks_missing": chunks_missing, "edges_missing": edges_missing, - "decision": ("apply" if args.apply else "dry-run"), + "needs_baseline_for_mode": needs_baseline, + "decision": ("baseline-only" if args.apply and do_baseline_only else + "apply" if args.apply and (changed or chunks_missing or edges_missing) else + "apply-skip-unchanged" if args.apply and not (changed or chunks_missing or edges_missing) else + "dry-run"), + "path": note_pl["path"], + "hash_mode": mode, + "hash_normalize": norm, + "hash_source": src, "prefix": cfg.prefix, - "path": note_pl["path"] - }, ensure_ascii=False)) + } + print(json.dumps(summary, ensure_ascii=False)) - # Writes + # --- Writes --- if not args.apply: continue - # Purge bei Änderungen - if args.purge_before_upsert and changed: + if do_baseline_only: + merged_hashes = {} + merged_hashes.update(old_hashes) + merged_hashes.update(note_pl.get("hashes") or {}) + if old_payload: + note_pl["hash_fulltext"] = old_payload.get("hash_fulltext", note_pl.get("hash_fulltext")) + note_pl["hash_signature"] = old_payload.get("hash_signature", note_pl.get("hash_signature")) + note_pl["hashes"] = merged_hashes + notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) + upsert_batch(client, notes_name, note_pts) + continue + + # Wenn nichts geändert und keine Artefakte fehlen → nichts zu tun + if not changed and not (chunks_missing or edges_missing): + continue + + # Purge nur bei echten Änderungen (unverändert + fehlende Artefakte ≠ Purge) + if args.purge_before_upsert and has_old and changed: try: purge_note_artifacts(client, cfg.prefix, note_id) except Exception as e: print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"})) - # Note + # Note nur bei Änderungen neu schreiben if changed: notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) upsert_batch(client, notes_name, note_pts) - # Chunks (wenn geändert ODER vorher keine vorhanden) + # Chunks schreiben, wenn geändert ODER vorher fehlend if chunk_pls and (changed or chunks_missing): chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs) upsert_batch(client, chunks_name, chunk_pts) - # Edges (wenn geändert ODER vorher keine vorhanden) + # Edges schreiben, wenn vorhanden und (geändert ODER vorher fehlend) if edges and (changed or edges_missing): edges_name, edge_pts = points_for_edges(cfg.prefix, edges) upsert_batch(client, edges_name, edge_pts)