diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py
index 2959896..b1288df 100644
--- a/scripts/import_markdown.py
+++ b/scripts/import_markdown.py
@@ -243,6 +243,28 @@ def delete_note_everywhere(client, prefix: str, note_id: str) -> None:
             print(json.dumps({"note_id": note_id, "warn": f"delete in {col} failed: {e}"}))
 
 
+# --- New: existence checks for artifacts (fault-tolerant rebuild) ---
+
+def _has_any_point(client, collection: str, note_id: str) -> bool:
+    """Return True if at least one point with this note_id exists in the collection (scroll with limit=1)."""
+    filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
+    pts, _ = client.scroll(
+        collection_name=collection,
+        scroll_filter=filt,
+        with_payload=False,
+        with_vectors=False,
+        limit=1,
+    )
+    return bool(pts)
+
+def artifacts_missing(client, prefix: str, note_id: str) -> Tuple[bool, bool]:
+    """Return (chunks_missing, edges_missing): a flag is True when that collection holds no point for note_id."""
+    _, chunks_col, edges_col = collections(prefix)
+    chunks_missing = not _has_any_point(client, chunks_col, note_id)
+    edges_missing = not _has_any_point(client, edges_col, note_id)
+    return chunks_missing, edges_missing
+
+
 # ---------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------
@@ -425,7 +447,7 @@ def main() -> None:
         changed = args.force_replace or (not has_old) or hash_changed or text_changed
         do_baseline_only = (args.baseline_modes and has_old and needs_baseline and not changed)
 
-        # --- Chunks + Embeddings ---
+        # --- Prepare chunks + embeddings ---
         try:
             body_text = getattr(parsed, "body", "") or ""
             chunks = assemble_chunks(fm["id"], body_text, fm.get("type", "concept"))
@@ -454,10 +476,14 @@ def main() -> None:
             except Exception as e:
                 print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"}))
 
+        # --- Determine which artifacts are missing in Qdrant ---
+        chunks_missing, edges_missing = artifacts_missing(client, cfg.prefix, note_id)
+
         # --- Edges (robust) ---
         edges: List[Dict[str, Any]] = []
         edges_failed = False
-        if changed and (not do_baseline_only):
+        should_build_edges = (changed and (not do_baseline_only)) or edges_missing
+        if should_build_edges:
             try:
                 note_refs = note_pl.get("references") or []
                 edges = build_edges_for_note(
@@ -479,10 +505,12 @@ def main() -> None:
             "edges": len(edges),
             "edges_failed": edges_failed,
             "changed": changed,
+            "chunks_missing": chunks_missing,
+            "edges_missing": edges_missing,
             "needs_baseline_for_mode": needs_baseline,
             "decision": ("baseline-only" if args.apply and do_baseline_only else
-                         "apply" if args.apply and changed else
-                         "apply-skip-unchanged" if args.apply and not changed else
+                         "apply" if args.apply and (changed or chunks_missing or edges_missing) else
+                         "apply-skip-unchanged" if args.apply and not (changed or chunks_missing or edges_missing) else
                          "dry-run"),
             "path": note_pl["path"],
             "hash_mode": mode,
@@ -508,21 +536,29 @@ def main() -> None:
             upsert_batch(client, notes_name, note_pts)
             continue
 
-        if not changed:
+        # Nothing changed and no artifacts missing → nothing to do
+        if not changed and not (chunks_missing or edges_missing):
             continue
 
-        if args.purge_before_upsert and has_old:
+        # Purge only on real changes (unchanged + missing artifacts ≠ purge)
+        if args.purge_before_upsert and has_old and changed:
             try:
                 purge_note_artifacts(client, cfg.prefix, note_id)
             except Exception as e:
                 print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
 
-        notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
-        upsert_batch(client, notes_name, note_pts)
-        if chunk_pls:
+        # Rewrite the note only when it changed
+        if changed:
+            notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
+            upsert_batch(client, notes_name, note_pts)
+
+        # Write chunks when changed OR previously missing
+        if chunk_pls and (changed or chunks_missing):
             chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
             upsert_batch(client, chunks_name, chunk_pts)
-        if edges:
+
+        # Write edges when present and (changed OR previously missing)
+        if edges and (changed or edges_missing):
             edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
             upsert_batch(client, edges_name, edge_pts)
 