scripts/health_check_mindnet.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
311e598b69
commit
b29ce4a0a6
|
|
@@ -2,27 +2,46 @@
|
|||
"""
|
||||
scripts/health_check_mindnet.py
|
||||
|
||||
Ein einfacher Health-Check für den mindnet-Retriever/Query-Endpoint.
|
||||
Kann z. B. über Cron, systemd oder n8n (SSH / Command-Node) ausgeführt werden.
|
||||
Health-Check für den mindnet-Retriever-/Query-Endpoint.
|
||||
|
||||
Funktion:
|
||||
- POST auf /query in den Modi "semantic" und "hybrid"
|
||||
- prüft HTTP-Status, JSON-Struktur, Anzahl Treffer
|
||||
- gibt eine kompakte JSON-Ausgabe zurück
|
||||
- Exit-Code 0 = OK, 1 = Fehler
|
||||
- Führt POST-Requests auf /query in verschiedenen Modi aus (standard: semantic + hybrid).
|
||||
- Prüft Status-Code, JSON-Struktur und Anzahl der Treffer.
|
||||
- Kennzeichnet Probleme als:
|
||||
- status="ok"
|
||||
- status="warning" (z.B. Timeout)
|
||||
- status="error" (harte Fehler wie HTTP-Fehler, JSON-Fehler etc.)
|
||||
|
||||
Beispiel:
|
||||
Exit-Code:
|
||||
- Default (tolerant):
|
||||
- overall_status = "ok" (inkl. warnings) → Exit-Code 0
|
||||
- overall_status = "error" → Exit-Code 1
|
||||
- Mit --strict:
|
||||
- warnings werden wie errors behandelt → Exit-Code 1
|
||||
|
||||
Beispiele:
|
||||
|
||||
python scripts/health_check_mindnet.py \
|
||||
--url http://127.0.0.1:8001/query \
|
||||
--query "embeddings" \
|
||||
--top-k 3
|
||||
--url http://127.0.0.1:8001/query \
|
||||
--query "embeddings" \
|
||||
--top-k 3
|
||||
|
||||
python scripts/health_check_mindnet.py \
|
||||
--url http://127.0.0.1:8001/query \
|
||||
--query "embeddings" \
|
||||
--top-k 3 \
|
||||
--timeout 15 \
|
||||
--modes hybrid
|
||||
|
||||
# Strenger Modus (warnings → Exit-Code 1)
|
||||
python scripts/health_check_mindnet.py --strict
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
|
|
@@ -38,16 +57,15 @@ def _post_query(
|
|||
timeout: float,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Führt einen POST auf den /query-Endpoint aus und gibt das decodierte JSON zurück.
|
||||
Führt einen POST auf den /query-Endpoint aus und gibt das dekodierte JSON zurück.
|
||||
|
||||
Raises:
|
||||
urllib.error.URLError, urllib.error.HTTPError, json.JSONDecodeError
|
||||
urllib.error.URLError, urllib.error.HTTPError, RuntimeError
|
||||
"""
|
||||
payload = {
|
||||
"mode": mode,
|
||||
"query": query,
|
||||
"top_k": top_k,
|
||||
# Minimal-Expand, damit der Hybridmodus nichts "Exotisches" braucht.
|
||||
"expand": {
|
||||
"depth": 1 if mode == "hybrid" else 0,
|
||||
"edge_types": ["references", "belongs_to", "prev", "next"],
|
||||
|
|
@@ -98,12 +116,15 @@ def _validate_response(
|
|||
"""
|
||||
Prüft die wichtigsten Invarianten des Query-Responses.
|
||||
|
||||
Gibt ein Result-Dict zurück mit:
|
||||
- status: "ok" oder "error"
|
||||
- message: str
|
||||
- latency_ms: int
|
||||
- used_mode: str
|
||||
- result_count: int
|
||||
Rückgabe-Format:
|
||||
|
||||
{
|
||||
"status": "ok" | "warning" | "error",
|
||||
"message": str,
|
||||
"latency_ms": int,
|
||||
"used_mode": str,
|
||||
"result_count": int
|
||||
}
|
||||
"""
|
||||
latency_ms = int(doc.get("_latency_ms", -1))
|
||||
used_mode = str(doc.get("used_mode", "unknown"))
|
||||
|
|
@@ -121,6 +142,8 @@ def _validate_response(
|
|||
result_count = len(results)
|
||||
|
||||
if result_count < min_results:
|
||||
# Zu wenige Ergebnisse sind aus Sicht eines Health-Checks eher ein Error,
|
||||
# da die Retrieval-Qualität nicht gewährleistet ist.
|
||||
return {
|
||||
"status": "error",
|
||||
"message": f"zu wenige Ergebnisse: {result_count} < {min_results}",
|
||||
|
|
@@ -129,19 +152,18 @@ def _validate_response(
|
|||
"result_count": result_count,
|
||||
}
|
||||
|
||||
# Optional: ein paar Felder im ersten Hit prüfen
|
||||
sample_msg = "OK"
|
||||
if result_count > 0:
|
||||
first = results[0]
|
||||
if not isinstance(first, dict):
|
||||
sample_msg = "first result ist kein Objekt"
|
||||
else:
|
||||
# einfache Plausibilitätschecks
|
||||
# einfache Plausibilitätschecks (nur Zugriff, kein strikter Typcheck)
|
||||
_ = first.get("note_id")
|
||||
_ = first.get("chunk_id")
|
||||
_ = first.get("total_score")
|
||||
|
||||
status = "ok" if sample_msg == "OK" else "error"
|
||||
status = "ok" if sample_msg == "OK" else "warning"
|
||||
return {
|
||||
"status": status,
|
||||
"message": sample_msg,
|
||||
|
|
@@ -168,9 +190,11 @@ def run_health_check(
|
|||
"top_k": top_k,
|
||||
"timeout_s": timeout,
|
||||
"checks": [],
|
||||
"overall_status": "ok",
|
||||
"overall_status": "ok", # wird unten ggf. auf warning oder error gesetzt
|
||||
}
|
||||
|
||||
overall_status = "ok"
|
||||
|
||||
for mode in modes:
|
||||
entry: Dict[str, Any] = {
|
||||
"mode": mode,
|
||||
|
|
@@ -181,12 +205,36 @@ def run_health_check(
|
|||
doc = _post_query(url=url, mode=mode, query=query, top_k=top_k, timeout=timeout)
|
||||
validation = _validate_response(mode=mode, doc=doc, min_results=min_results)
|
||||
entry.update(validation)
|
||||
except (urllib.error.URLError, urllib.error.HTTPError) as exc:
|
||||
|
||||
except urllib.error.HTTPError as exc:
|
||||
entry["status"] = "error"
|
||||
entry["message"] = f"HTTP/Netzwerkfehler: {exc}"
|
||||
entry["message"] = f"HTTP-Fehler: {exc}"
|
||||
entry["latency_ms"] = -1
|
||||
entry["used_mode"] = mode
|
||||
entry["result_count"] = -1
|
||||
|
||||
except urllib.error.URLError as exc:
|
||||
# URLError kann u.a. socket.timeout enthalten.
|
||||
lat = -1
|
||||
entry["latency_ms"] = lat
|
||||
entry["used_mode"] = mode
|
||||
entry["result_count"] = -1
|
||||
|
||||
if isinstance(exc.reason, socket.timeout):
|
||||
entry["status"] = "warning"
|
||||
entry["message"] = f"Timeout (URLError/socket.timeout): {exc}"
|
||||
else:
|
||||
entry["status"] = "error"
|
||||
entry["message"] = f"HTTP/Netzwerkfehler: {exc}"
|
||||
|
||||
except socket.timeout as exc:
|
||||
# Direkter Timeout (falls nicht in URLError verpackt)
|
||||
entry["status"] = "warning"
|
||||
entry["message"] = f"Timeout (socket.timeout): {exc}"
|
||||
entry["latency_ms"] = -1
|
||||
entry["used_mode"] = mode
|
||||
entry["result_count"] = -1
|
||||
|
||||
except Exception as exc: # noqa: BLE001
|
||||
entry["status"] = "error"
|
||||
entry["message"] = f"unerwarteter Fehler: {exc}"
|
||||
|
|
@@ -194,14 +242,16 @@ def run_health_check(
|
|||
entry["used_mode"] = mode
|
||||
entry["result_count"] = -1
|
||||
|
||||
# overall_status bestimmen:
|
||||
# - error > warning > ok
|
||||
if entry["status"] == "error":
|
||||
overall_status = "error"
|
||||
elif entry["status"] == "warning" and overall_status == "ok":
|
||||
overall_status = "warning"
|
||||
|
||||
report["checks"].append(entry)
|
||||
|
||||
# Overall-Status bestimmen
|
||||
for c in report["checks"]:
|
||||
if c.get("status") != "ok":
|
||||
report["overall_status"] = "error"
|
||||
break
|
||||
|
||||
report["overall_status"] = overall_status
|
||||
return report
|
||||
|
||||
|
||||
|
|
@@ -243,6 +293,11 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
|
|||
default=1,
|
||||
help="Minimale Anzahl erwarteter Ergebnisse (Default: %(default)s)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--strict",
|
||||
action="store_true",
|
||||
help="Warnings als Fehler behandeln (overall_status=warning führt zu Exit-Code 1)",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
|
|
@@ -260,7 +315,7 @@ def main(argv: List[str]) -> int:
|
|||
|
||||
overall = report.get("overall_status", "error")
|
||||
|
||||
# Kurze menschenlesbare Zusammenfassung:
|
||||
# Menschlich lesbare Zusammenfassung:
|
||||
print(f"mindnet health: {overall}")
|
||||
for c in report["checks"]:
|
||||
mode = c.get("mode")
|
||||
|
|
@@ -270,10 +325,15 @@ def main(argv: List[str]) -> int:
|
|||
msg = c.get("message", "")
|
||||
print(f" - {mode}: {status} (latency={latency} ms, results={result_count}) {msg}")
|
||||
|
||||
# JSON-Ausgabe für n8n / Maschinen:
|
||||
# JSON-Ausgabe (z.B. für n8n):
|
||||
print(json.dumps(report, ensure_ascii=False))
|
||||
|
||||
return 0 if overall == "ok" else 1
|
||||
if args.strict:
|
||||
# strict: warning wird wie error behandelt
|
||||
return 0 if overall == "ok" else 1
|
||||
else:
|
||||
# tolerant: nur echte errors führen zu Exit-Code 1
|
||||
return 0 if overall in ("ok", "warning") else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user