diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index fea52de..57988bb 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -3,42 +3,32 @@ """ scripts/import_markdown.py -Purpose -------- -Import Markdown notes from a vault into Qdrant with idempotent upserts. -This version fixes the issue where `retriever_weight` for *notes* did not -reflect the values from `types.yaml`. It does so by building the note payload -from a dict containing the normalized frontmatter, and by ensuring the -Type‑Registry is loaded via ENV (`MINDNET_TYPES_FILE`, default: ./config/types.yaml). +Zweck +----- +- Liest Markdown-Notizen aus einem Vault ein +- Erzeugt Note-Payload, Chunk-Payloads (+ optionale Embeddings) und Edges +- Schreibt alles idempotent in Qdrant (Notes, Chunks, Edges) +- Integriert eine optionale Type-Registry (types.yaml), um z. B. chunk_profile + und retriever_weight pro Notiz-Typ zu steuern. -Key behaviors -------------- -- Deterministic, idempotent upserts for notes / chunks / edges -- Optional embeddings for chunks -- Optional sync‑deletes (vault → Qdrant) -- Ensures collections and payload indices exist -- Honors `retriever_weight` and `chunk_profile` from types.yaml for both notes and chunks +Kompatibilität & Fixes +---------------------- +- Unterstützt sowohl app.core.derive_edges (bevorzugt) als auch app.core.edges als Fallback + → Aufruf erfolgt mit POSITIONSARGUMENTEN, damit alte Signaturen (note_level_refs vs. note_level_references) + nicht zu TypeError führen. +- `scroll_filter` wird für alle Scrolls verwendet (Qdrant >= 1.7.x). +- `--purge-before-upsert` entfernt alte Chunks/Edges einer Note, wenn sich die Note geändert hat. +- `retriever_weight` aus types.yaml bzw. Frontmatter wird in Note- und Chunk-Payload gespiegelt. +- Baseline-Hash-Strategie: hash_mode (body|frontmatter|full), hash_source (parsed|raw), hash_normalize (canonical|none). -CLI examples ------------- - # Apply + purge +Aufrufbeispiele +--------------- + # Import (Apply + Purge-Update) python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert --prefix "$COLLECTION_PREFIX" - # Sync-Deletes (dry-run then apply) - python3 -m scripts.import_markdown --vault ./vault --sync-deletes - python3 -m scripts.import_markdown --vault ./vault --sync-deletes --apply - -Environment ------------ -- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY -- COLLECTION_PREFIX (default: mindnet); overridable via --prefix -- VECTOR_DIM (default: 384) -- MINDNET_TYPES_FILE (default: ./config/types.yaml) -- MINDNET_NOTE_SCOPE_REFS=true|false (default: false) -- MINDNET_HASH_SOURCE=parsed|raw (default: parsed) -- MINDNET_HASH_NORMALIZE=canonical|none (default: canonical) + # Nur Sync-Deletes (wenn Dateien entfernt wurden) + python3 -m scripts.import_markdown --vault ./vault --sync-deletes --apply --prefix "$COLLECTION_PREFIX" """ - from __future__ import annotations import argparse @@ -50,19 +40,21 @@ from typing import Dict, List, Optional, Tuple, Any, Set from dotenv import load_dotenv from qdrant_client.http import models as rest -# --- Project imports --- +# --- Projekt-Imports --- from app.core.parser import ( read_markdown, normalize_frontmatter, validate_required_frontmatter, ) from app.core.note_payload import make_note_payload +from app.core.chunker import assemble_chunks from app.core.chunk_payload import make_chunk_payloads try: + # bevorzugt der robuste Builder from app.core.derive_edges import build_edges_for_note except Exception: # pragma: no cover + # Fallback auf die einfache Variante from app.core.edges import build_edges_for_note # type: ignore - from app.core.qdrant import ( QdrantConfig, get_client, @@ -76,162 +68,194 @@ from app.core.qdrant_points import ( upsert_batch, ) -# embeddings optional +# embeddings sind optional (z. B. beim Payload-Backfill) try: - from app.core.embed import embed_texts + from app.core.embed import embed_texts # optional except Exception: # pragma: no cover embed_texts = None - -# ---------------------- helpers ---------------------- +# ------------------------------------------------------------ +# Type-Registry (types.yaml) +# ------------------------------------------------------------ +import yaml def _env(name: str, default: Optional[str] = None) -> str: - v = os.getenv(name, default if default is not None else "") - return v if v is not None else "" + v = os.getenv(name) + return v if v is not None else (default or "") -def _resolve_mode(val: Optional[str]) -> str: - v = (val or _env("MINDNET_HASH_COMPARE", "body")).strip().lower() - return v if v in ("body","frontmatter","full") else "body" - -def _iter_md(root: str) -> List[str]: - files: List[str] = [] - for dirpath, _, filenames in os.walk(root): - for fn in filenames: - if fn.lower().endswith(".md"): - files.append(os.path.join(dirpath, fn)) - files.sort() - return files - -def _types_file_default() -> str: - # default to ./config/types.yaml inside project root - # run is expected from /home/llmadmin/mindnet - default = os.path.abspath("./config/types.yaml") - return _env("MINDNET_TYPES_FILE", default) - -def load_type_registry() -> Dict[str, Any]: - import yaml # local import - path = _types_file_default() +def load_type_registry() -> dict: + # ENV kann Pfad überschreiben; Standard: ./config/types.yaml + path = _env("MINDNET_TYPES_FILE", "./config/types.yaml") + if not os.path.isfile(path): + return {} try: with open(path, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - t = data.get("types") or {} - return t if isinstance(t, dict) else {} + return yaml.safe_load(f) or {} except Exception: return {} -def resolve_note_type(note_type: Optional[str], reg: Dict[str, Any]) -> str: - if not note_type: - return "concept" - s = str(note_type).strip() - return s if s in reg else s # allow free types if not configured +def get_type_config(note_type: Optional[str], reg: dict) -> dict: + if not note_type or not isinstance(reg, dict): + return {} + types = reg.get("types", {}) if isinstance(reg.get("types"), dict) else {} + return types.get(note_type, {}) if isinstance(types, dict) else {} -def effective_chunk_profile(note_type: str, reg: Dict[str, Any]) -> Optional[str]: +def resolve_note_type(requested: Optional[str], reg: dict) -> str: + if requested and isinstance(requested, str): + return requested + types = reg.get("types", {}) if isinstance(reg.get("types"), dict) else {} + return "concept" if "concept" in types else (requested or "concept") + +def effective_chunk_profile(note_type: str, reg: dict) -> Optional[str]: + cfg = get_type_config(note_type, reg) + prof = cfg.get("chunk_profile") + if isinstance(prof, str): + return prof + return None + +def effective_retriever_weight(note_type: str, reg: dict) -> Optional[float]: + cfg = get_type_config(note_type, reg) + w = cfg.get("retriever_weight") try: - v = reg.get(note_type, {}).get("chunk_profile") - return str(v) if v is not None else None + return float(w) if w is not None else None except Exception: return None -def effective_retriever_weight(note_type: str, reg: Dict[str, Any]) -> Optional[float]: - try: - v = reg.get(note_type, {}).get("retriever_weight") - return float(v) if v is not None else None - except Exception: +# ------------------------------------------------------------ +# Sonstige Helper +# ------------------------------------------------------------ +def iter_md(root: str) -> List[str]: + out: List[str] = [] + for dirpath, _, filenames in os.walk(root): + for fn in filenames: + if not fn.lower().endswith(".md"): + continue + p = os.path.join(dirpath, fn) + pn = p.replace("\\", "/") + if any(ex in pn for ex in ["/.obsidian/", "/_backup_frontmatter/", "/_imported/"]): + continue + out.append(p) + return sorted(out) + +def collections(prefix: str) -> Tuple[str, str, str]: + return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + +def fetch_existing_note_payload(client, prefix: str, note_id: str) -> Optional[Dict]: + notes_col, _, _ = collections(prefix) + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + points, _ = client.scroll( + collection_name=notes_col, + scroll_filter=f, # wichtig: scroll_filter (nicht: filter) + with_payload=True, + with_vectors=False, + limit=1, + ) + if not points: return None + return points[0].payload or {} def list_qdrant_note_ids(client, prefix: str) -> Set[str]: - """Collect all note_ids from Qdrant mindnet_notes payloads.""" - from qdrant_client import QdrantClient - notes = f"{prefix}_notes" + notes_col, _, _ = collections(prefix) out: Set[str] = set() - # scroll with page size - offset = None + next_page = None while True: - res = client.scroll(collection_name=notes, with_payload=True, with_vectors=False, limit=2048, offset=offset) - pts = getattr(res, "points", None) or res[0] # API compatibility - next_off = getattr(res, "next_page_offset", None) or res[1] + pts, next_page = client.scroll( + collection_name=notes_col, + with_payload=True, + with_vectors=False, + limit=256, + offset=next_page, + ) + if not pts: + break for p in pts: - pl = getattr(p, "payload", {}) or {} - nid = pl.get("note_id") or pl.get("id") + pl = p.payload or {} + nid = pl.get("note_id") if isinstance(nid, str): out.add(nid) - if not next_off: + if next_page is None: break - offset = next_off return out -def fetch_existing_note_payload(client, prefix: str, note_id: str) -> Optional[Dict[str, Any]]: - notes = f"{prefix}_notes" - flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - sr = client.scroll(collection_name=notes, with_payload=True, with_vectors=False, limit=1, scroll_filter=flt) - pts = getattr(sr, "points", None) or sr[0] - if not pts: - return None - return getattr(pts[0], "payload", None) or None - def purge_note_artifacts(client, prefix: str, note_id: str) -> None: - """Delete old chunks/edges for a note (idempotent).""" - chunks = f"{prefix}_chunks" - edges = f"{prefix}_edges" - flt_note = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - try: - client.delete_points(collection_name=chunks, points_selector=flt_note, wait=True) - except Exception: - client.delete(collection_name=chunks, points_selector=flt_note, wait=True) - - flt_src = rest.Filter(should=[ - rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)), - rest.FieldCondition(key="target_id", match=rest.MatchValue(value=note_id)), - rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id)), - ]) - try: - client.delete_points(collection_name=edges, points_selector=flt_src, wait=True) - except Exception: - client.delete(collection_name=edges, points_selector=flt_src, wait=True) + _, chunks_col, edges_col = collections(prefix) + filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + for col in (chunks_col, edges_col): + try: + client.delete( + collection_name=col, + points_selector=rest.FilterSelector(filter=filt), + wait=True + ) + except Exception as e: + print(json.dumps({"note_id": note_id, "warn": f"delete in {col} via filter failed: {e}"})) def delete_note_everywhere(client, prefix: str, note_id: str) -> None: - """Delete note + artifacts (chunks/edges).""" - notes = f"{prefix}_notes" - purge_note_artifacts(client, prefix, note_id) - flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - try: - client.delete_points(collection_name=notes, points_selector=flt, wait=True) - except Exception: - client.delete(collection_name=notes, points_selector=flt, wait=True) + notes_col, chunks_col, edges_col = collections(prefix) + filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + for col in (edges_col, chunks_col, notes_col): + try: + client.delete( + collection_name=col, + points_selector=rest.FilterSelector(filter=filt), + wait=True + ) + except Exception as e: + print(json.dumps({"note_id": note_id, "warn": f"delete in {col} failed: {e}"})) +def _has_any_point(client, collection: str, note_id: str) -> bool: + filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + pts, _ = client.scroll( + collection_name=collection, + scroll_filter=filt, + with_payload=False, + with_vectors=False, + limit=1, + ) + return bool(pts) -# ---------------------- main ---------------------- +def artifacts_missing(client, prefix: str, note_id: str) -> Tuple[bool, bool]: + _, chunks_col, edges_col = collections(prefix) + chunks_missing = not _has_any_point(client, chunks_col, note_id) + edges_missing = not _has_any_point(client, edges_col, note_id) + return chunks_missing, edges_missing + +# ------------------------------------------------------------ +# Main +# ------------------------------------------------------------ +def _resolve_mode(m: Optional[str]) -> str: + m = (m or "body").strip().lower() + return m if m in {"body", "frontmatter", "full"} else "body" def main() -> None: load_dotenv() - ap = argparse.ArgumentParser(description="Import Markdown notes into Qdrant (idempotent).") - ap.add_argument("--vault", required=True, help="Path to the vault (folder with .md files)") - ap.add_argument("--only-path", help="Process only this file (absolute or relative)") - ap.add_argument("--apply", action="store_true", help="Write to Qdrant (otherwise dry-run)") - ap.add_argument("--purge-before-upsert", action="store_true", help="Delete old chunks/edges for the note before upserting") - ap.add_argument("--force-replace", action="store_true", help="Replace note/chunks/edges regardless of hash changes") - ap.add_argument("--note-id", help="Process only notes with this id") - ap.add_argument("--note-scope-refs", action="store_true", help="Create note-scope references/backlinks") - ap.add_argument("--hash-mode", help="body|frontmatter|full (default body)") - ap.add_argument("--hash-source", help="parsed|raw (default parsed)") - ap.add_argument("--hash-normalize", help="canonical|none (default canonical)") - ap.add_argument("--compare-text", action="store_true", help="Additionally compare parsed fulltext") - ap.add_argument("--baseline-modes", action="store_true", help="Backfill missing hash variants silently (notes)") - ap.add_argument("--sync-deletes", action="store_true", help="Qdrant->Vault delete sync (dry-run; use with --apply to execute)") - ap.add_argument("--prefix", help="Collection prefix (overrides ENV COLLECTION_PREFIX)") + ap = argparse.ArgumentParser( + prog="scripts.import_markdown", + description="Importiert Markdown-Notizen in Qdrant (Notes/Chunks/Edges)." + ) + ap.add_argument("--vault", required=True, help="Pfad zum Vault (Ordner mit .md-Dateien)") + ap.add_argument("--only-path", help="Nur diese Datei verarbeiten (absolut oder relativ)") + ap.add_argument("--apply", action="store_true", help="Schreibt nach Qdrant (sonst Dry-Run)") + ap.add_argument("--purge-before-upsert", action="store_true", help="Alte Chunks/Edges der Note vorher löschen") + ap.add_argument("--force-replace", action="store_true", help="Note/Chunks/Edges unabhängig von Hash neu schreiben") + ap.add_argument("--note-id", help="Nur Notes mit dieser ID verarbeiten (Filter)") + ap.add_argument("--note-scope-refs", action="store_true", help="Note-scope References/Backlinks erzeugen") + ap.add_argument("--hash-mode", help="body|frontmatter|full (Default body)") + ap.add_argument("--hash-source", help="parsed|raw (Default parsed)") + ap.add_argument("--hash-normalize", help="canonical|none (Default canonical)") + ap.add_argument("--compare-text", action="store_true", help="Parsed fulltext zusätzlich direkt vergleichen") + ap.add_argument("--baseline-modes", action="store_true", help="Fehlende Hash-Varianten still nachtragen (Notes)") + ap.add_argument("--sync-deletes", action="store_true", help="Qdrant->Vault Lösch-Sync (Dry-Run; mit --apply ausführen)") + ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)") args = ap.parse_args() - # Ensure default types path if not provided via ENV - if not os.getenv("MINDNET_TYPES_FILE"): - os.environ["MINDNET_TYPES_FILE"] = _types_file_default() - mode = _resolve_mode(args.hash_mode) # body|frontmatter|full src = _env("MINDNET_HASH_SOURCE", args.hash_source or "parsed") # parsed|raw norm = _env("MINDNET_HASH_NORMALIZE", args.hash_normalize or "canonical") # canonical|none - note_scope_refs_env = (_env("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true") + note_scope_refs_env = (_env("MINDNET_NOTE_SCOPE_REFS", "false") == "true") note_scope_refs = args.note_scope_refs or note_scope_refs_env - compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false").strip().lower() == "true") + compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true") # Qdrant cfg = QdrantConfig.from_env() @@ -241,22 +265,22 @@ def main() -> None: ensure_collections(client, cfg.prefix, cfg.dim) ensure_payload_indexes(client, cfg.prefix) - # Type-Registry + # Type-Registry laden (optional) reg = load_type_registry() root = os.path.abspath(args.vault) - # File list + # Dateiliste if args.only_path: only = os.path.abspath(args.only_path) files = [only] else: - files = _iter_md(root) + files = iter_md(root) if not files: - print("No Markdown files found.", file=sys.stderr) + print("Keine Markdown-Dateien gefunden.", file=sys.stderr) sys.exit(2) - # Optional: Sync-Deletes (vault -> qdrant) + # Optional: Sync-Deletes vorab if args.sync_deletes: vault_note_ids: Set[str] = set() for path in files: @@ -285,8 +309,9 @@ def main() -> None: print(json.dumps({"action": "delete", "note_id": nid, "decision": "apply"})) delete_note_everywhere(client, cfg.prefix, nid) - processed = 0 + key_current = f"{mode}:{src}:{norm}" + processed = 0 for path in files: try: parsed = read_markdown(path) @@ -296,7 +321,7 @@ def main() -> None: print(json.dumps({"path": path, "error": f"read_markdown failed: {type(e).__name__}: {e}"})) continue - # Frontmatter + # --- Frontmatter prüfen --- try: fm = normalize_frontmatter(parsed.frontmatter) validate_required_frontmatter(fm) @@ -309,7 +334,7 @@ def main() -> None: processed += 1 - # Apply type-registry to FM + # --- Type-Registry anwenden (chunk_profile / retriever_weight) --- try: note_type = resolve_note_type(fm.get("type"), reg) except Exception: @@ -327,17 +352,16 @@ def main() -> None: except Exception: pass - # --- Build NOTE payload (IMPORTANT: build from dict to capture FM overrides) --- + # --- Payload aufbauen (inkl. Hashes) --- try: - note_input = { - "frontmatter": fm, - "id": fm.get("id"), - "title": fm.get("title"), - "type": fm.get("type"), - "path": path, - "body": getattr(parsed, "body", "") or "", - } - note_pl = make_note_payload(note_input, file_path=path) + note_pl = make_note_payload( + parsed, + vault_root=root, + hash_mode=mode, + hash_normalize=norm, + hash_source=src, + file_path=path, + ) except Exception as e: print(json.dumps({"path": path, "error": f"make_note_payload failed: {type(e).__name__}: {e}"})) continue @@ -345,7 +369,7 @@ def main() -> None: if not note_pl.get("fulltext"): note_pl["fulltext"] = getattr(parsed, "body", "") or "" - # Ensure retriever_weight is present on note payload (from FM/types) + # retriever_weight sicher in Note-Payload spiegeln if "retriever_weight" not in note_pl and fm.get("retriever_weight") is not None: try: note_pl["retriever_weight"] = float(fm.get("retriever_weight")) @@ -357,74 +381,91 @@ def main() -> None: print(json.dumps({"path": path, "error": "Missing note_id after payload build"})) continue - # Compare against existing payload to detect changes + # --- bestehenden Payload laden (zum Diff) --- old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id) has_old = old_payload is not None - old_text = (old_payload or {}).get("fulltext") or "" - new_text = note_pl.get("fulltext") or "" - text_changed = (old_text != new_text) + old_hashes = (old_payload or {}).get("hashes") or {} + old_hash_exact = old_hashes.get(key_current) + new_hash_exact = (note_pl.get("hashes") or {}).get(key_current) + needs_baseline = (old_hash_exact is None) - changed = args.force_replace or (not has_old) or text_changed + hash_changed = (old_hash_exact is not None and new_hash_exact is not None and old_hash_exact != new_hash_exact) - # --- CHUNKS --- + text_changed = False + if compare_text: + old_text = (old_payload or {}).get("fulltext") or "" + new_text = note_pl.get("fulltext") or "" + text_changed = (old_text != new_text) + + changed = args.force_replace or (not has_old) or hash_changed or text_changed + do_baseline_only = (args.baseline_modes and has_old and needs_baseline and not changed) + + # --- Chunks + Embeddings vorbereiten --- try: - # Make chunk payloads from the same note dict; chunker will honor FM profile - chunk_note = { - "frontmatter": fm, - "id": fm.get("id"), - "title": fm.get("title"), - "type": fm.get("type"), - "path": path, - "body": getattr(parsed, "body", "") or "", - } - chunk_pls: List[Dict[str, Any]] = make_chunk_payloads(chunk_note, file_path=path) + body_text = getattr(parsed, "body", "") or "" + chunks = assemble_chunks(fm["id"], body_text, fm.get("type", "concept")) + chunk_pls: List[Dict[str, Any]] = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) except Exception as e: print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {type(e).__name__}: {e}"})) continue - # embeddings (optional) - vecs = None - if embed_texts: + # retriever_weight auf Chunk-Payload spiegeln + if fm.get("retriever_weight") is not None: try: - texts = [c.get("window") or c.get("text") or "" for c in chunk_pls] - vecs = embed_texts(texts) if texts else None + rw = float(fm.get("retriever_weight")) + for pl in chunk_pls: + if "retriever_weight" not in pl: + pl["retriever_weight"] = rw + except Exception: + pass + + # Embeddings (fallback: Nullvektoren) + vecs: List[List[float]] = [[0.0] * int(cfg.dim) for _ in chunk_pls] + if embed_texts and chunk_pls: + try: + texts_for_embed = [(pl.get("window") or pl.get("text") or "") for pl in chunk_pls] + vecs = embed_texts(texts_for_embed) except Exception as e: - print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed failed: {e}"})) + print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"})) - # --- EDGES --- + # --- Fehlende Artefakte in Qdrant ermitteln --- + chunks_missing, edges_missing = artifacts_missing(client, cfg.prefix, note_id) + + # --- Edges (robust) --- edges: List[Dict[str, Any]] = [] - try: - include_note_scope = bool(note_scope_refs) - edges = build_edges_for_note(note_id, chunk_pls, None, include_note_scope) - except Exception as e: - print(json.dumps({"path": path, "note_id": note_id, "warn": f"edges failed: {e}"})) - - # Check missing artifacts when unchanged - chunks_missing = False - edges_missing = False - if has_old and not changed: - # best-effort existence checks + edges_failed = False + should_build_edges = (changed and (not do_baseline_only)) or edges_missing + if should_build_edges: try: - # If at least one chunk for note_id exists → assume not missing - chunks_missing = False - except Exception: - chunks_missing = True - try: - edges_missing = False - except Exception: - edges_missing = True + # ACHTUNG: POSITIONSARGUMENTE benutzen → kompatibel zu edges.py UND derive_edges.py + note_refs = note_pl.get("references") or [] + edges = build_edges_for_note(note_id, chunk_pls, note_refs, include_note_scope_refs=note_scope_refs) + except Exception as e: + edges_failed = True + edges = [] + print(json.dumps({"path": path, "note_id": note_id, "warn": f"build_edges_for_note failed, skipping edges: {type(e).__name__}: {e}"})) - # --- Summary (dry-run log) --- + # --- Summary (stdout) --- summary = { "note_id": note_id, "title": fm.get("title"), - "type": fm.get("type"), - "path": path, - "changed": changed, "chunks": len(chunk_pls), "edges": len(edges), - "apply": bool(args.apply), + "edges_failed": edges_failed, + "changed": changed, + "chunks_missing": chunks_missing, + "edges_missing": edges_missing, + "needs_baseline_for_mode": needs_baseline, + "decision": ("baseline-only" if args.apply and do_baseline_only else + "apply" if args.apply and (changed or chunks_missing or edges_missing) else + "apply-skip-unchanged" if args.apply and not (changed or chunks_missing or edges_missing) else + "dry-run"), + "path": note_pl["path"], + "hash_mode": mode, + "hash_normalize": norm, + "hash_source": src, + "prefix": cfg.prefix, } print(json.dumps(summary, ensure_ascii=False)) @@ -432,24 +473,40 @@ def main() -> None: if not args.apply: continue - # purge artifacts if requested and we indeed change the note + if do_baseline_only: + merged_hashes = {} + merged_hashes.update(old_hashes) + merged_hashes.update(note_pl.get("hashes") or {}) + if old_payload: + note_pl["hash_fulltext"] = old_payload.get("hash_fulltext", note_pl.get("hash_fulltext")) + note_pl["hash_signature"] = old_payload.get("hash_signature", note_pl.get("hash_signature")) + note_pl["hashes"] = merged_hashes + notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) + upsert_batch(client, notes_name, note_pts) + continue + + # Wenn nichts geändert und keine Artefakte fehlen → nichts zu tun + if not changed and not (chunks_missing or edges_missing): + continue + + # Purge nur bei echten Änderungen if args.purge_before_upsert and has_old and changed: try: purge_note_artifacts(client, cfg.prefix, note_id) except Exception as e: print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"})) - # write note when changed or not exists - if changed or not has_old: + # Note nur bei Änderungen neu schreiben + if changed: notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) upsert_batch(client, notes_name, note_pts) - # write chunks when changed or previously missing + # Chunks schreiben, wenn geändert ODER vorher fehlend if chunk_pls and (changed or chunks_missing): chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs) upsert_batch(client, chunks_name, chunk_pts) - # write edges when available + # Edges schreiben, wenn vorhanden und (geändert ODER vorher fehlend) if edges and (changed or edges_missing): edges_name, edge_pts = points_for_edges(cfg.prefix, edges) upsert_batch(client, edges_name, edge_pts)