From 75f13af1f147ddab5615933bd36ff36ca64c862e Mon Sep 17 00:00:00 2001 From: Lars Date: Sun, 10 Aug 2025 08:06:10 +0200 Subject: [PATCH] init: add knowledge, llm-api, scripts + repo settings --- .editorconfig | 11 + .gitattributes | 11 + .gitignore | 35 ++ knowledge/karatetrainer/_imported/Lateral.txt | 5 + .../_imported/example_martial_arts.pdf | 68 +++ knowledge/karatetrainer/_imported/hikete.txt | 1 + .../karatetrainer/_imported/mae-geri.txt | 1 + knowledge/karatetrainer/_imported/quatsch.txt | 1 + .../_imported/test_karate_chudan_zuki.txt | 12 + .../_imported/test_multiple_chunks.txt | 19 + .../_imported/test_para_chunks.txt | 7 + knowledge/karatetrainer/_imported/zuki.txt | 1 + llm-api/archiv/app.py | 110 +++++ llm-api/archiv/llm_apiV20.py | 167 +++++++ llm-api/archiv/llm_api_bk.py | 196 ++++++++ llm-api/archiv/llm_api_bk2.py | 196 ++++++++ llm-api/archiv/llm_api_v1.py | 220 +++++++++ llm-api/clients.py | 24 + llm-api/embed_router.py | 126 ++++++ llm-api/embed_router1.1.0.py | 126 ++++++ llm-api/exercise_router.py | 181 ++++++++ llm-api/llm_api.py | 33 ++ llm-api/old strukture/llm_api.py | 421 ++++++++++++++++++ llm-api/old strukture/llm_api1.1.1.py | 319 +++++++++++++ llm-api/old strukture/llm_api1.1.11.py | 323 ++++++++++++++ llm-api/old strukture/llm_api1.1.5.py | 341 ++++++++++++++ llm-api/old strukture/llm_api1.1.6.py | 421 ++++++++++++++++++ llm-api/test_delete_filters.sh | 173 +++++++ llm-api/test_exercise_idempotent.sh | 71 +++ llm-api/test_exercise_plan.sh | 155 +++++++ llm-api/test_llm_api.sh | 79 ++++ llm-api/test_llm_api_full.sh | 90 ++++ llm-api/test_llm_api_wiki.sh | 51 +++ llm-api/test_wiki_ep1.sh | 24 + llm-api/test_wiki_exercises.sh | 41 ++ llm-api/test_wiki_router.sh | 110 +++++ llm-api/uvicorn.log | 10 + llm-api/wiki_cookies.txt | 4 + llm-api/wiki_router.py | 173 +++++++ llm-api/wiki_router0.1.1.py | 110 +++++ llm-api/wiki_router0.1.2.py | 121 +++++ llm-api/wiki_router1.1.5.py | 168 +++++++ llm-api/wiki_router1.1.6.py | 164 +++++++ llm-api/wiki_router1.1.7.py | 165 +++++++ llm-api/wiki_router1.1.9.py | 172 +++++++ llm-api/wiki_router1.2.0.py | 173 +++++++ scripts/archiv/chunker_utils.py | 34 ++ scripts/archiv/import_documents.py | 93 ++++ scripts/archiv/import_textfile.py | 61 +++ scripts/archiv/import_texts.py | 31 ++ scripts/archiv/index_documents.py | 66 +++ scripts/archiv/text_chunker.py | 46 ++ scripts/chunking_utils.py | 155 +++++++ scripts/import_folder_txt.py | 61 +++ scripts/import_pdf_file.py | 92 ++++ scripts/import_single_file.py | 85 ++++ scripts/import_txtdocuments.py | 135 ++++++ scripts/prompt_documents.py | 32 ++ scripts/restore_imported_files.py | 45 ++ scripts/restore_single_file.py | 36 ++ scripts/search_documents.py | 31 ++ scripts/wiki_importer.py | 238 ++++++++++ scripts/wiki_importer1.1.0.py | 117 +++++ scripts/wiki_importer2.0.0.py | 136 ++++++ 64 files changed, 6924 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 knowledge/karatetrainer/_imported/Lateral.txt create mode 100644 knowledge/karatetrainer/_imported/example_martial_arts.pdf create mode 100644 knowledge/karatetrainer/_imported/hikete.txt create mode 100644 knowledge/karatetrainer/_imported/mae-geri.txt create mode 100644 knowledge/karatetrainer/_imported/quatsch.txt create mode 100644 knowledge/karatetrainer/_imported/test_karate_chudan_zuki.txt create mode 100644 knowledge/karatetrainer/_imported/test_multiple_chunks.txt create mode 100644 knowledge/karatetrainer/_imported/test_para_chunks.txt 
create mode 100644 knowledge/karatetrainer/_imported/zuki.txt create mode 100644 llm-api/archiv/app.py create mode 100644 llm-api/archiv/llm_apiV20.py create mode 100644 llm-api/archiv/llm_api_bk.py create mode 100644 llm-api/archiv/llm_api_bk2.py create mode 100644 llm-api/archiv/llm_api_v1.py create mode 100644 llm-api/clients.py create mode 100644 llm-api/embed_router.py create mode 100644 llm-api/embed_router1.1.0.py create mode 100644 llm-api/exercise_router.py create mode 100644 llm-api/llm_api.py create mode 100644 llm-api/old strukture/llm_api.py create mode 100644 llm-api/old strukture/llm_api1.1.1.py create mode 100644 llm-api/old strukture/llm_api1.1.11.py create mode 100644 llm-api/old strukture/llm_api1.1.5.py create mode 100644 llm-api/old strukture/llm_api1.1.6.py create mode 100755 llm-api/test_delete_filters.sh create mode 100755 llm-api/test_exercise_idempotent.sh create mode 100755 llm-api/test_exercise_plan.sh create mode 100755 llm-api/test_llm_api.sh create mode 100755 llm-api/test_llm_api_full.sh create mode 100755 llm-api/test_llm_api_wiki.sh create mode 100755 llm-api/test_wiki_ep1.sh create mode 100755 llm-api/test_wiki_exercises.sh create mode 100755 llm-api/test_wiki_router.sh create mode 100644 llm-api/uvicorn.log create mode 100644 llm-api/wiki_cookies.txt create mode 100644 llm-api/wiki_router.py create mode 100644 llm-api/wiki_router0.1.1.py create mode 100644 llm-api/wiki_router0.1.2.py create mode 100644 llm-api/wiki_router1.1.5.py create mode 100644 llm-api/wiki_router1.1.6.py create mode 100644 llm-api/wiki_router1.1.7.py create mode 100644 llm-api/wiki_router1.1.9.py create mode 100644 llm-api/wiki_router1.2.0.py create mode 100644 scripts/archiv/chunker_utils.py create mode 100644 scripts/archiv/import_documents.py create mode 100644 scripts/archiv/import_textfile.py create mode 100644 scripts/archiv/import_texts.py create mode 100644 scripts/archiv/index_documents.py create mode 100644 scripts/archiv/text_chunker.py create mode 100644 scripts/chunking_utils.py create mode 100644 scripts/import_folder_txt.py create mode 100644 scripts/import_pdf_file.py create mode 100644 scripts/import_single_file.py create mode 100644 scripts/import_txtdocuments.py create mode 100644 scripts/prompt_documents.py create mode 100644 scripts/restore_imported_files.py create mode 100644 scripts/restore_single_file.py create mode 100644 scripts/search_documents.py create mode 100644 scripts/wiki_importer.py create mode 100644 scripts/wiki_importer1.1.0.py create mode 100644 scripts/wiki_importer2.0.0.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..e1a5a84 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,11 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true + +[*.{py,sh,yml}] +indent_style = space +indent_size = 4 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f4b6b48 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,11 @@ +* text=auto eol=lf +*.sh text eol=lf +*.py text eol=lf +*.yml text eol=lf +*.json text eol=lf +*.md text eol=lf +*.bat text eol=crlf + +*.png binary +*.jpg binary +*.pdf binary diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d70bee9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +*.egg-info/ +.python-version + +# Virtualenvs +venv/ +.venv/ +venvs/ +*/venv/ +*/.venv/ + +# Node/JS (falls vorhanden) +node_modules/ + +# OS/Editor +.DS_Store +Thumbs.db + +# 
Test/Cache +.pytest_cache/ +.cache/ + +# Logs/Temp +logs/ +tmp/ + +# ENV/Secrets (WICHTIG!) +.env +*/.env +*.env +secrets/ diff --git a/knowledge/karatetrainer/_imported/Lateral.txt b/knowledge/karatetrainer/_imported/Lateral.txt new file mode 100644 index 0000000..c84abf8 --- /dev/null +++ b/knowledge/karatetrainer/_imported/Lateral.txt @@ -0,0 +1,5 @@ +Ein Lateral bezeichnet die Fülle an Dingen, die eine bestimmte Abhängigkeit voneinander haben, aber trotzdem eine sinnvolle in sich abgeschlossenes Wissenselement darstellen. + +Laterale in der Sportwissenschaft beziehen sich auf die unabhängigen Bewegungen der Extremitäten (Arme, Beine) die zu einer Gesamtbewegung zusammengefasst werden können. So sind z.B. beim Balancieren die Beine beteiligt, aber auch die Arme und der Oberkörper um ausgleichend wirken zu können. + +Das zusammengenommen bildet eine wesentliche Betrachtung im Aufbau der Trainingspläne und Konzepte. diff --git a/knowledge/karatetrainer/_imported/example_martial_arts.pdf b/knowledge/karatetrainer/_imported/example_martial_arts.pdf new file mode 100644 index 0000000..a745721 --- /dev/null +++ b/knowledge/karatetrainer/_imported/example_martial_arts.pdf @@ -0,0 +1,68 @@ +%PDF-1.3 +% ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/Contents 7 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 6 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +4 0 obj +<< +/PageMode /UseNone /Pages 6 0 R /Type /Catalog +>> +endobj +5 0 obj +<< +/Author (anonymous) /CreationDate (D:20250806060113+00'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20250806060113+00'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (unspecified) /Title (untitled) /Trapped /False +>> +endobj +6 0 obj +<< +/Count 1 /Kids [ 3 0 R ] /Type /Pages +>> +endobj +7 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 951 +>> +stream +Gaqc5gMYJ*&:O"Kb[Wq1BLkMnD[F*C5tGi"''dZo*nuV??"kqujQeh4:@2"X,%Yt9k;#,s$[h+$2t)P\>C(KLVtk\nQ8DsoQF-T[f@r*OcVHm:h=,W66*#q>Cpj,L9#iC@r/)NNle7flVR:\:)->OZ5h]%cX5@`^(jqhh.^/O;ejWO4^hk#b25._.=AtnUmArZ7^S1$:g:jET%AQ2-)943R/aL&E4B5WGY(^AJQebL2\o"bV\E5?(7m'a"kqTAO$iQ.Y5QK;>LJBP-lfeqKbFU(sJce^a,>?+&HSt9F)*1.uM0;&Q1^!#[3NnE['%ThVeX^FUm^atB6*iS-4V%!q&")8id5Q+$'.EP`a&9-'!"2:m^=LS).s<&4K.i\WB=*]5B2E6@Vm!QU*'Xl2ir$/!UB_:V8^G$e/XqjCoED!W'NZ8aJ22BseM=&").8te;U;*qFs<[Ti%UjNkR6dM(X&W_5*Pq[+\s?NGZiJLUi=;"dJ5XclcePb\>"UjBk\J#e21+`3SP6\'Sq\DYM`endstream +endobj +xref +0 8 +0000000000 65535 f +0000000073 00000 n +0000000104 00000 n +0000000211 00000 n +0000000414 00000 n +0000000482 00000 n +0000000778 00000 n +0000000837 00000 n +trailer +<< +/ID +[] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 5 0 R +/Root 4 0 R +/Size 8 +>> +startxref +1878 +%%EOF diff --git a/knowledge/karatetrainer/_imported/hikete.txt b/knowledge/karatetrainer/_imported/hikete.txt new file mode 100644 index 0000000..86faaf8 --- /dev/null +++ b/knowledge/karatetrainer/_imported/hikete.txt @@ -0,0 +1 @@ +Hikite ist die zurückziehende Bewegung der nicht schlagenden Hand. 
Sie stabilisiert und verstärkt die Technik. diff --git a/knowledge/karatetrainer/_imported/mae-geri.txt b/knowledge/karatetrainer/_imported/mae-geri.txt new file mode 100644 index 0000000..c94af26 --- /dev/null +++ b/knowledge/karatetrainer/_imported/mae-geri.txt @@ -0,0 +1 @@ +Mae-geri ist ein gerader Fußtritt nach vorne. Er wird durch das Anheben des Knies und das Ausstrecken des Beins ausgeführt. diff --git a/knowledge/karatetrainer/_imported/quatsch.txt b/knowledge/karatetrainer/_imported/quatsch.txt new file mode 100644 index 0000000..8884223 --- /dev/null +++ b/knowledge/karatetrainer/_imported/quatsch.txt @@ -0,0 +1 @@ +Eine Talahonattacke besteht aus einem Tornadokick mit anschließender 360-Grad-Drehung und dem Ausatmen von Knoblauchatem. diff --git a/knowledge/karatetrainer/_imported/test_karate_chudan_zuki.txt b/knowledge/karatetrainer/_imported/test_karate_chudan_zuki.txt new file mode 100644 index 0000000..4dfa7cf --- /dev/null +++ b/knowledge/karatetrainer/_imported/test_karate_chudan_zuki.txt @@ -0,0 +1,12 @@ +Die Technik Chūdan-zuki ist ein mittlerer Fauststoß, der auf den Solarplexus oder die Brust des Gegners abzielt. Sie gehört zu den Grundtechniken des Karate und wird in vielen Kata und Kumite-Formen eingesetzt. Der Stoß beginnt aus der Hikite-Position, bei der eine Faust an der Hüfte zurückgezogen ist. Während der Stoß ausgeführt wird, rotiert der Oberkörper leicht und der Unterarm bleibt auf einer geraden Linie zum Ziel. + +Wichtig ist, dass der Stoß exakt in der Körpermitte ausgeführt wird und die Schulter entspannt bleibt. Die Kraft entsteht durch die Verbindung von Hüftrotation, Körperspannung und Atmung. Bei korrekter Ausführung ist Chūdan-zuki eine effektive Technik zur Selbstverteidigung. + +Trainingshinweise: +- Achte auf eine saubere Körperhaltung. +- Halte das Handgelenk stabil, um Verletzungen zu vermeiden. +- Übe die Technik zunächst langsam und steigere dann die Geschwindigkeit. +- Beobachte die Hüftrotation im Spiegel oder lasse dich von einem Partner korrigieren. +- Führe mindestens 20 Wiederholungen pro Seite aus. + +In Kombination mit Gedan-barai und Age-uke ergibt sich eine effektive Abwehr- und Kontertechnik. Diese Kombination ist besonders im Gōjū-Ryū und Shotokan-Stil verbreitet. diff --git a/knowledge/karatetrainer/_imported/test_multiple_chunks.txt b/knowledge/karatetrainer/_imported/test_multiple_chunks.txt new file mode 100644 index 0000000..fbd2a75 --- /dev/null +++ b/knowledge/karatetrainer/_imported/test_multiple_chunks.txt @@ -0,0 +1,19 @@ +Dies ist ein erster kurzer Absatz über Mae-Geri. + +Der zweite Absatz behandelt Yoko-Geri und seine biomechanische Ausführung. + +Ein dritter, etwas längerer Absatz beschreibt Fehlerquellen bei der Ausführung des Mawashi-Geri, inklusive falscher Hüftrotation und fehlender Stabilität im Standbein. Dieser Absatz ist länger, um das Chunking-Verhalten bei langen Texten zu testen. + +Abschließend folgt ein kurzer Satz. + +Die Philosophie des Karate-Do geht weit über das körperliche Training hinaus. Sie basiert auf Prinzipien wie Respekt, Selbstdisziplin und stetiger Entwicklung. Viele Dojos orientieren sich an einem Ehrenkodex, der in der Regel zu Beginn und am Ende jeder Einheit rezitiert wird. + +„Do“ steht für den Weg – den Lebensweg. Karate wird nicht als Sport verstanden, den man betreibt, sondern als ein Weg, den man beschreitet. Dieser Weg erfordert Geduld, Ausdauer und den Willen, auch Rückschläge als Teil des Lernprozesses zu akzeptieren.
+ +In der traditionellen Lehre wird oft betont, dass der beste Kampf derjenige ist, der nicht geführt werden muss. Gewaltverzicht, Kontrolle über Emotionen und Verantwortungsbewusstsein stehen im Vordergrund. Karate soll helfen, innere Stärke und äußere Gelassenheit zu entwickeln. + +Die Philosophie des Karate-Do geht weit über das körperliche Training hinaus. Sie basiert auf Prinzipien wie Respekt, Selbstdisziplin und stetiger Entwicklung. Viele Dojos orientieren sich an einem Ehrenkodex, der in der Regel zu Beginn und am Ende jeder Einheit rezitiert wird. + +„Do“ steht für den Weg – den Lebensweg. Karate wird nicht als Sport verstanden, den man betreibt, sondern als ein Weg, den man beschreitet. Dieser Weg erfordert Geduld, Ausdauer und den Willen, auch Rückschläge als Teil des Lernprozesses zu akzeptieren. + +In der traditionellen Lehre wird oft betont, dass der beste Kampf derjenige ist, der nicht geführt werden muss. Gewaltverzicht, Kontrolle über Emotionen und Verantwortungsbewusstsein stehen im Vordergrund. Karate soll helfen, innere Stärke und äußere Gelassenheit zu entwickeln. diff --git a/knowledge/karatetrainer/_imported/test_para_chunks.txt b/knowledge/karatetrainer/_imported/test_para_chunks.txt new file mode 100644 index 0000000..87d5853 --- /dev/null +++ b/knowledge/karatetrainer/_imported/test_para_chunks.txt @@ -0,0 +1,7 @@ +Dies ist ein erster kurzer Absatz über Mae-Geri. + +Der zweite Absatz behandelt Yoko-Geri und seine biomechanische Ausführung. + +Ein dritter, etwas längerer Absatz beschreibt Fehlerquellen bei der Ausführung des Mawashi-Geri, inklusive falscher Hüftrotation und fehlender Stabilität im Standbein. Dieser Absatz ist länger, um das Chunking-Verhalten bei langen Texten zu testen. + +Abschließend folgt ein kurzer Satz. diff --git a/knowledge/karatetrainer/_imported/zuki.txt b/knowledge/karatetrainer/_imported/zuki.txt new file mode 100644 index 0000000..3fe9fac --- /dev/null +++ b/knowledge/karatetrainer/_imported/zuki.txt @@ -0,0 +1 @@ +Zuki bezeichnet einen Fauststoß im Karate. Er kann als Choku-zuki (gerader Stoß) oder Gyaku-zuki (gegengesetzter Stoß) ausgeführt werden. 
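The _imported files above are the test corpus that the import scripts and the llm-api service below operate on. A minimal sketch of how one of these files reaches Qdrant through the /embed endpoint defined below, assuming the service runs on uvicorn's default port 8000 and that the collection is named karatetrainer (both values are assumptions, not fixed anywhere in this patch):

curl -X POST http://localhost:8000/embed -H 'Content-Type: application/json' -d '{"collection": "karatetrainer", "chunks": [{"text": "Zuki bezeichnet einen Fauststoß im Karate.", "source": "zuki.txt", "source_type": "file", "tags": ["karate"]}]}'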
diff --git a/llm-api/archiv/app.py b/llm-api/archiv/app.py new file mode 100644 index 0000000..bf92fd1 --- /dev/null +++ b/llm-api/archiv/app.py @@ -0,0 +1,110 @@ +from fastapi import FastAPI, Query +from pydantic import BaseModel +from typing import List +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct +import requests + +app = FastAPI() + +# Initialisierung +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient(host="localhost", port=6333) +# COLLECTION = "karate-doku" +OLLAMA_URL = "http://localhost:11434/api/generate" +OLLAMA_MODEL = "mistral" # kann später auch geändert werden + +# Embedding-Input +class EmbedRequest(BaseModel): + texts: List[str] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + + +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + + embeddings = model.encode(data.texts).tolist() + points = [ + PointStruct(id=i, vector=vec, payload={"text": data.texts[i]}) + for i, vec in enumerate(embeddings) + ] + qdrant.upsert(collection_name=collection_name, points=points) + return {"status": "✅ embeddings saved", "count": len(points), "collection": collection_name} + +@app.get("/search") +def search_text(query: str = Query(...), limit: int = 3, collection: str = Query(...)): + vec = model.encode(query).tolist() + results = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [{"score": r.score, "text": r.payload["text"]} for r in results] + +@app.post("/prompt") +def generate_prompt(data: PromptRequest): + query_vec = model.encode(data.query).tolist() + + # Suche relevante Einträge aus der angegebenen Collection + results = qdrant.search( + collection_name=data.collection, + query_vector=query_vec, + limit=data.context_limit + ) + + # Kontext für den Prompt aus den gefundenen Texten zusammenbauen + context = "\n".join([r.payload["text"] for r in results]) + full_prompt = f"""Beantworte die folgende Frage basierend auf dem Kontext: + +Kontext: +{context} + +Frage: +{data.query} +""" + + # Anfrage an Ollama stellen + ollama_payload = { + "model": OLLAMA_MODEL, + "prompt": full_prompt, + "stream": False + } + + response = requests.post(OLLAMA_URL, json=ollama_payload) + response.raise_for_status() + answer = response.json()["response"] + + return { + "answer": answer, + "context": context, + "collection": data.collection + } diff --git a/llm-api/archiv/llm_apiV20.py b/llm-api/archiv/llm_apiV20.py new file mode 100644 index 0000000..8a89d08 --- /dev/null +++ b/llm-api/archiv/llm_apiV20.py @@ -0,0 +1,167 @@ +from fastapi import FastAPI, Query, HTTPException, Request +from fastapi.responses import JSONResponse +from fastapi.openapi.utils import get_openapi +from pydantic import BaseModel, Field +from typing import List, Dict, Any, Optional +from sentence_transformers import
SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct, PointIdsList +from qdrant_client.http.models import Filter, FieldCondition, MatchValue +from uuid import uuid4 +import requests +import os +from datetime import datetime + +# Version hochgezählt +__version__ = "1.0.20" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + +# Ollama-Konfiguration +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/api/generate") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral:latest") + +# FastAPI-Instanz +app = FastAPI( + title="KI Trainerassistent API", + description="Lokale API für Karate- & Gewaltschutz-Trainingsplanung", + version=__version__, + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json" +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request: Request, exc: Exception): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler. Bitte später erneut versuchen."}) + +# Datenmodelle +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + +class EmbedResponse(BaseModel): + status: str + count: int + collection: str + +class SearchResultItem(BaseModel): + score: float = Field(..., ge=0) + text: str + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# Embedding-Modell und Qdrant-Client +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient(host=os.getenv("QDRANT_HOST", "localhost"), port=int(os.getenv("QDRANT_PORT", 6333))) + +# /embed +@app.post("/embed", response_model=EmbedResponse) +def embed_texts(data: EmbedRequest): + if not data.chunks: + raise HTTPException(status_code=400, detail="'chunks' darf nicht leer sein.") + coll = data.collection + if not qdrant.collection_exists(coll): + qdrant.recreate_collection(collection_name=coll, + vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [PointStruct(id=str(uuid4()), vector=embeddings[i], payload={'text': c.text, 'source': c.source}) + for i, c in enumerate(data.chunks)] + qdrant.upsert(collection_name=coll, points=points) + return EmbedResponse(status="✅ Saved", count=len(points), collection=coll) + +# /search +@app.get("/search", response_model=List[SearchResultItem]) +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [SearchResultItem(score=r.score, text=r.payload['text']) for r in res] + +# /prompt +@app.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): raise HTTPException(status_code=400, 
detail="'query' darf nicht leer sein.") + if not (1 <= data.context_limit <= 10): raise HTTPException(status_code=400, detail="'context_limit' muss zwischen 1 und 10 liegen.") + hits = qdrant.search(collection_name=data.collection, query_vector=model.encode(data.query).tolist(), limit=data.context_limit) + context = '\n'.join(h.payload['text'] for h in hits) + payload = {'model': OLLAMA_MODEL, 'prompt': f"Context:\n{context}\nQuestion: {data.query}", 'stream': False} + try: + r = requests.post(OLLAMA_URL, json=payload, timeout=30); r.raise_for_status() + except Exception: + raise HTTPException(status_code=502, detail="LLM-Service-Fehler.") + return PromptResponse(answer=r.json().get('response', ''), context=context, collection=data.collection) + +# /delete-source (neue Routine gemäß ursprünglicher funktionierender Logik) +@app.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: str = Query(...), + type: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + # Filter-Bedingungen + must = [{"key": "source", "match": {"value": source}}] + if type: + must.append({"key": "type", "match": {"value": type}}) + # IDs sammeln via scroll_filter + try: + points, _ = qdrant.scroll( + collection_name=collection, + scroll_filter={"must": must}, + limit=10000 + ) + except Exception as exc: + print(f"[ERROR] Scroll failed: {exc}", flush=True) + raise HTTPException(status_code=500, detail="Fehler beim Abrufen der Punkte vor dem Löschen.") + point_ids = [str(pt.id) for pt in points] + if not point_ids: + return DeleteResponse(status="🔍 Keine passenden Einträge gefunden.", count=0, collection=collection, source=source, type=type) + # Lösche mittels PointIdsList(points=...) 
+ try: + qdrant.delete( + collection_name=collection, + points_selector=PointIdsList(points=point_ids) + ) + except Exception as exc: + print(f"[ERROR] Delete failed: {exc}", flush=True) + raise HTTPException(status_code=500, detail="Fehler beim Löschen nach Source.") + return DeleteResponse(status="🗑️ gelöscht", count=len(point_ids), collection=collection, source=source, type=type) + +# /delete-collection +@app.delete("/delete-collection", response_model=DeleteResponse) +def delete_collection(collection: str = Query(...)): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) diff --git a/llm-api/archiv/llm_api_bk.py b/llm-api/archiv/llm_api_bk.py new file mode 100644 index 0000000..99203bb --- /dev/null +++ b/llm-api/archiv/llm_api_bk.py @@ -0,0 +1,196 @@ +from fastapi import FastAPI, Query, HTTPException +from pydantic import BaseModel +from typing import List +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct +from fastapi import HTTPException +from uuid import uuid4 +import requests +from datetime import datetime + +from qdrant_client.models import PointIdsList + + +app = FastAPI() + +# Konfiguration +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient(host="localhost", port=6333) +OLLAMA_URL = "http://localhost:11434/api/generate" +OLLAMA_MODEL = "mistral" + +# Datenmodelle + +from typing import List, Dict, Any + +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "file" + title: str | None = None + version: str | None = None + related_to: str | None = None + tags: List[str] = [] + owner: str | None = None + context_tag: str | None = None + imported_at: str | None = None + chunk_index: int | None = None + category: str | None = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + + + + + +@app.delete("/delete-source") +def delete_by_source( + collection: str = Query(...), + source: str = Query(...), + type: str = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + + must = [{"key": "source", "match": {"value": source}}] + if type: + must.append({"key": "type", "match": {"value": type}}) + + result = qdrant.scroll( + collection_name=collection, + scroll_filter={"must": must}, + limit=10000 + ) + + points = result[0] + if not points: + return {"status": "🔍 Keine passenden Einträge gefunden."} + + point_ids = [] + for point in points: + pid = point.id + point_ids.append(str(pid)) # immer zu String casten + + qdrant.delete( + collection_name=collection, + points_selector=PointIdsList(points=point_ids) + + ) + + return { + "status": "🗑️ gelöscht", + "count": len(point_ids), + "collection": collection, + "source": source, + "type": type + } + + + +@app.delete("/delete-collection") +def delete_collection(collection: str = Query(...)): + """ + Löscht eine gesamte Collection aus Qdrant. 
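+    Achtung: nicht umkehrbar; alle Punkte und Vektoren der Collection gehen verloren.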
+ """ + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + + qdrant.delete_collection(collection_name=collection) + return {"status": "🗑️ gelöscht", "collection": collection} + + +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + + embeddings = model.encode([chunk.text for chunk in data.chunks]).tolist() + + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + + point = PointStruct( + id=str(uuid4()), + vector=embeddings[i], + payload=payload + ) + points.append(point) + + qdrant.upsert(collection_name=collection_name, points=points) + + return { + "status": "✅ embeddings saved", + "count": len(points), + "collection": collection_name + } + + + + +@app.get("/search") +def search_text(query: str = Query(...), limit: int = 3, collection: str = Query(...)): + vec = model.encode(query).tolist() + results = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [{"score": r.score, "text": r.payload["text"]} for r in results] + +@app.post("/prompt") +def generate_prompt(data: PromptRequest): + query_vec = model.encode(data.query).tolist() + results = qdrant.search( + collection_name=data.collection, + query_vector=query_vec, + limit=data.context_limit + ) + + context = "\n".join([r.payload["text"] for r in results]) + full_prompt = f"""Beantworte die folgende Frage basierend auf dem Kontext: + +Kontext: +{context} + +Frage: +{data.query} +""" + + ollama_payload = { + "model": OLLAMA_MODEL, + "prompt": full_prompt, + "stream": False + } + + response = requests.post(OLLAMA_URL, json=ollama_payload) + response.raise_for_status() + answer = response.json()["response"] + + return { + "answer": answer, + "context": context, + "collection": data.collection + } diff --git a/llm-api/archiv/llm_api_bk2.py b/llm-api/archiv/llm_api_bk2.py new file mode 100644 index 0000000..99203bb --- /dev/null +++ b/llm-api/archiv/llm_api_bk2.py @@ -0,0 +1,196 @@ +from fastapi import FastAPI, Query, HTTPException +from pydantic import BaseModel +from typing import List +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct +from fastapi import HTTPException +from uuid import uuid4 +import requests +from datetime import datetime + +from qdrant_client.models import PointIdsList + + +app = FastAPI() + +# Konfiguration +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient(host="localhost", port=6333) +OLLAMA_URL = "http://localhost:11434/api/generate" +OLLAMA_MODEL = "mistral" + +# Datenmodelle + +from typing import List, Dict, Any + +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "file" + title: str | None = None + version: str | None = None + related_to: str | None = None + tags: List[str] = [] + owner: str | 
None = None + context_tag: str | None = None + imported_at: str | None = None + chunk_index: int | None = None + category: str | None = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + + + + + +@app.delete("/delete-source") +def delete_by_source( + collection: str = Query(...), + source: str = Query(...), + type: str = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + + must = [{"key": "source", "match": {"value": source}}] + if type: + must.append({"key": "type", "match": {"value": type}}) + + result = qdrant.scroll( + collection_name=collection, + scroll_filter={"must": must}, + limit=10000 + ) + + points = result[0] + if not points: + return {"status": "🔍 Keine passenden Einträge gefunden."} + + point_ids = [] + for point in points: + pid = point.id + point_ids.append(str(pid)) # immer zu String casten + + qdrant.delete( + collection_name=collection, + points_selector=PointIdsList(points=point_ids) + + ) + + return { + "status": "🗑️ gelöscht", + "count": len(point_ids), + "collection": collection, + "source": source, + "type": type + } + + + +@app.delete("/delete-collection") +def delete_collection(collection: str = Query(...)): + """ + Löscht eine gesamte Collection aus Qdrant. + """ + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + + qdrant.delete_collection(collection_name=collection) + return {"status": "🗑️ gelöscht", "collection": collection} + + +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + + embeddings = model.encode([chunk.text for chunk in data.chunks]).tolist() + + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + + point = PointStruct( + id=str(uuid4()), + vector=embeddings[i], + payload=payload + ) + points.append(point) + + qdrant.upsert(collection_name=collection_name, points=points) + + return { + "status": "✅ embeddings saved", + "count": len(points), + "collection": collection_name + } + + + + +@app.get("/search") +def search_text(query: str = Query(...), limit: int = 3, collection: str = Query(...)): + vec = model.encode(query).tolist() + results = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [{"score": r.score, "text": r.payload["text"]} for r in results] + +@app.post("/prompt") +def generate_prompt(data: PromptRequest): + query_vec = model.encode(data.query).tolist() + results = qdrant.search( + collection_name=data.collection, + query_vector=query_vec, + limit=data.context_limit + ) + + context = "\n".join([r.payload["text"] for r in results]) + full_prompt = f"""Beantworte die folgende Frage basierend auf dem Kontext: + +Kontext: 
+{context} + +Frage: +{data.query} +""" + + ollama_payload = { + "model": OLLAMA_MODEL, + "prompt": full_prompt, + "stream": False + } + + response = requests.post(OLLAMA_URL, json=ollama_payload) + response.raise_for_status() + answer = response.json()["response"] + + return { + "answer": answer, + "context": context, + "collection": data.collection + } diff --git a/llm-api/archiv/llm_api_v1.py b/llm-api/archiv/llm_api_v1.py new file mode 100644 index 0000000..a5b2960 --- /dev/null +++ b/llm-api/archiv/llm_api_v1.py @@ -0,0 +1,220 @@ +from fastapi import FastAPI, Query, HTTPException +from fastapi.openapi.utils import get_openapi +from pydantic import BaseModel +from typing import List +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct +from fastapi import HTTPException +from uuid import uuid4 +import requests +from datetime import datetime + +from qdrant_client.models import PointIdsList + + +app = FastAPI( + title="Lokaler KI Agent", + description="Lokale API zur Ansteuerung des LLM und qdrant", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json", +) + + +# Konfiguration +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient(host="localhost", port=6333) +OLLAMA_URL = "http://localhost:11434/api/generate" +OLLAMA_MODEL = "mistral" + +# Datenmodelle + +from typing import List, Dict, Any + +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "file" + title: str | None = None + version: str | None = None + related_to: str | None = None + tags: List[str] = [] + owner: str | None = None + context_tag: str | None = None + imported_at: str | None = None + chunk_index: int | None = None + category: str | None = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + + + + + +@app.delete("/delete-source") +def delete_by_source( + collection: str = Query(...), + source: str = Query(...), + type: str = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + + must = [{"key": "source", "match": {"value": source}}] + if type: + must.append({"key": "type", "match": {"value": type}}) + + result = qdrant.scroll( + collection_name=collection, + scroll_filter={"must": must}, + limit=10000 + ) + + points = result[0] + if not points: + return {"status": "🔍 Keine passenden Einträge gefunden."} + + point_ids = [] + for point in points: + pid = point.id + point_ids.append(str(pid)) # immer zu String casten + + qdrant.delete( + collection_name=collection, + points_selector=PointIdsList(points=point_ids) + + ) + + return { + "status": "🗑️ gelöscht", + "count": len(point_ids), + "collection": collection, + "source": source, + "type": type + } + + + +@app.delete("/delete-collection") +def delete_collection(collection: str = Query(...)): + """ + Löscht eine gesamte Collection aus Qdrant. 
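+    Ein späterer /embed-Aufruf legt die Collection bei Bedarf automatisch neu an.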
+ """ + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + + qdrant.delete_collection(collection_name=collection) + return {"status": "🗑️ gelöscht", "collection": collection} + + +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + + embeddings = model.encode([chunk.text for chunk in data.chunks]).tolist() + + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + + point = PointStruct( + id=str(uuid4()), + vector=embeddings[i], + payload=payload + ) + points.append(point) + + qdrant.upsert(collection_name=collection_name, points=points) + + return { + "status": "✅ embeddings saved", + "count": len(points), + "collection": collection_name + } + + + + +@app.get("/search") +def search_text(query: str = Query(...), limit: int = 3, collection: str = Query(...)): + vec = model.encode(query).tolist() + results = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [{"score": r.score, "text": r.payload["text"]} for r in results] + +@app.post("/prompt") +def generate_prompt(data: PromptRequest): + query_vec = model.encode(data.query).tolist() + results = qdrant.search( + collection_name=data.collection, + query_vector=query_vec, + limit=data.context_limit + ) + + context = "\n".join([r.payload["text"] for r in results]) + full_prompt = f"""Beantworte die folgende Frage basierend auf dem Kontext: + +Kontext: +{context} + +Frage: +{data.query} +""" + + ollama_payload = { + "model": OLLAMA_MODEL, + "prompt": full_prompt, + "stream": False + } + + response = requests.post(OLLAMA_URL, json=ollama_payload) + response.raise_for_status() + answer = response.json()["response"] + + return { + "answer": answer, + "context": context, + "collection": data.collection + } + +def custom_openapi(): + if app.openapi_schema: + return app.openapi_schema + openapi_schema = get_openapi( + title=app.title, + version=app.version, + description=app.description, + routes=app.routes, + ) + # hier können z.B. 
Security-Schemes ergänzt werden + app.openapi_schema = openapi_schema + return app.openapi_schema + +app.openapi = custom_openapi diff --git a/llm-api/clients.py b/llm-api/clients.py new file mode 100644 index 0000000..4b00484 --- /dev/null +++ b/llm-api/clients.py @@ -0,0 +1,24 @@ +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance +import os + +# Embedding-Modell +model = SentenceTransformer("all-MiniLM-L6-v2") + +# Qdrant-Client +qdrant = QdrantClient( + host=os.getenv("QDRANT_HOST", "localhost"), + port=int(os.getenv("QDRANT_PORT", 6333)) +) + +# Collections initialisieren +for coll in ["exercises", "training_plans"]: + if not qdrant.collection_exists(coll): + qdrant.recreate_collection( + collection_name=coll, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) diff --git a/llm-api/embed_router.py b/llm-api/embed_router.py new file mode 100644 index 0000000..32465b1 --- /dev/null +++ b/llm-api/embed_router.py @@ -0,0 +1,126 @@ +from fastapi import APIRouter, HTTPException, Query +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from typing import List, Optional +from uuid import uuid4 +from clients import model, qdrant +from qdrant_client.models import PointStruct, VectorParams, Distance, PointIdsList +import requests, os + +router = APIRouter() + +# Models +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str = Field(..., description="Suchanfrage") + context_limit: int = Field(default=3, ge=1, le=10, description="Anzahl Kontext-Dokumente") + collection: str = Field(default="default", description="Qdrant-Collection") + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# Endpoints +@router.post("/embed") +def embed_texts(data: EmbedRequest): + if not qdrant.collection_exists(data.collection): + qdrant.recreate_collection( + collection_name=data.collection, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [PointStruct(id=str(uuid4()), vector=emb, payload=c.dict()) + for emb, c in zip(embeddings, data.chunks)] + qdrant.upsert(collection_name=data.collection, points=points) + return {"status": "✅ embeddings saved", "count": len(points), "collection": data.collection} + +@router.get("/search") +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [{"score": r.score, "text": r.payload.get("text", "")} for r in res] + +@router.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): + raise HTTPException(status_code=400, detail="'query' darf 
nicht leer sein.") + hits = qdrant.search( + collection_name=data.collection, + query_vector=model.encode(data.query).tolist(), + limit=data.context_limit + ) + context = "\n".join(h.payload.get("text", "") for h in hits) + llm_url = os.getenv("OLLAMA_URL") + if not llm_url: + raise HTTPException(status_code=500, detail="LLM-Service-URL nicht konfiguriert.") + payload = { + "model": os.getenv("OLLAMA_MODEL"), + "prompt": f"Context:\n{context}\nQuestion: {data.query}", + "stream": False + } + try: + r = requests.post(llm_url, json=payload, timeout=30) + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"LLM-Service-Fehler: {e}") + return PromptResponse(answer=r.json().get("response", ""), context=context, collection=data.collection) + +@router.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + filt = [] + if source: + filt.append({"key": "source", "match": {"value": source}}) + if type: + filt.append({"key": "type", "match": {"value": type}}) + if not filt: + raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.") + pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must": filt}, limit=10000) + ids = [str(p.id) for p in pts] + if not ids: + return DeleteResponse(status="🔍 Keine Einträge gefunden.", count=0, collection=collection) + qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids)) + return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection) + +# Delete entire collection +@router.delete("/delete-collection", response_model=DeleteResponse) +def delete_collection( + collection: str = Query(...) 
+): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) diff --git a/llm-api/embed_router1.1.0.py b/llm-api/embed_router1.1.0.py new file mode 100644 index 0000000..32465b1 --- /dev/null +++ b/llm-api/embed_router1.1.0.py @@ -0,0 +1,126 @@ +from fastapi import APIRouter, HTTPException, Query +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from typing import List, Optional +from uuid import uuid4 +from clients import model, qdrant +from qdrant_client.models import PointStruct, VectorParams, Distance, PointIdsList +import requests, os + +router = APIRouter() + +# Models +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str = Field(..., description="Suchanfrage") + context_limit: int = Field(default=3, ge=1, le=10, description="Anzahl Kontext-Dokumente") + collection: str = Field(default="default", description="Qdrant-Collection") + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# Endpoints +@router.post("/embed") +def embed_texts(data: EmbedRequest): + if not qdrant.collection_exists(data.collection): + qdrant.recreate_collection( + collection_name=data.collection, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [PointStruct(id=str(uuid4()), vector=emb, payload=c.dict()) + for emb, c in zip(embeddings, data.chunks)] + qdrant.upsert(collection_name=data.collection, points=points) + return {"status": "✅ embeddings saved", "count": len(points), "collection": data.collection} + +@router.get("/search") +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [{"score": r.score, "text": r.payload.get("text", "")} for r in res] + +@router.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): + raise HTTPException(status_code=400, detail="'query' darf nicht leer sein.") + hits = qdrant.search( + collection_name=data.collection, + query_vector=model.encode(data.query).tolist(), + limit=data.context_limit + ) + context = "\n".join(h.payload.get("text", "") for h in hits) + llm_url = os.getenv("OLLAMA_URL") + if not llm_url: + raise HTTPException(status_code=500, detail="LLM-Service-URL nicht konfiguriert.") + payload = { + "model": os.getenv("OLLAMA_MODEL"), + "prompt": f"Context:\n{context}\nQuestion: {data.query}", + "stream": False + } + try: + r = requests.post(llm_url, json=payload, timeout=30) + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, 
detail=f"LLM-Service-Fehler: {e}") + return PromptResponse(answer=r.json().get("response", ""), context=context, collection=data.collection) + +@router.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + filt = [] + if source: + filt.append({"key": "source", "match": {"value": source}}) + if type: + filt.append({"key": "type", "match": {"value": type}}) + if not filt: + raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.") + pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must": filt}, limit=10000) + ids = [str(p.id) for p in pts] + if not ids: + return DeleteResponse(status="🔍 Keine Einträge gefunden.", count=0, collection=collection) + qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids)) + return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection) + +# Delete entire collection +@router.delete("/delete-collection", response_model=DeleteResponse) +def delete_collection( + collection: str = Query(...) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) diff --git a/llm-api/exercise_router.py b/llm-api/exercise_router.py new file mode 100644 index 0000000..f3d42aa --- /dev/null +++ b/llm-api/exercise_router.py @@ -0,0 +1,181 @@ +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel, Field +from typing import List, Optional, Dict, Any +from uuid import uuid4 +from datetime import datetime, date +from clients import model, qdrant +from qdrant_client.models import PointStruct, VectorParams, Distance, PointIdsList +import os + +router = APIRouter() + +# ---- Models ---- +class Exercise(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + summary: str + short_description: str + keywords: List[str] = [] + link: Optional[str] = None + + discipline: str + group: Optional[str] = None + age_group: str + target_group: str + min_participants: int + duration_minutes: int + + capabilities: Dict[str, int] = {} + category: str + + purpose: str + execution: str + notes: str + preparation: str + method: str + equipment: List[str] = [] + +class PhaseExercise(BaseModel): + exercise_id: str + cond_load: Dict[str, Any] = {} + coord_load: Dict[str, Any] = {} + instructions: str + +class PlanPhase(BaseModel): + name: str + duration_minutes: int + method: str + method_notes: str + exercises: List[PhaseExercise] + +class TrainingPlan(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + short_description: str + collection: str + discipline: str + group: Optional[str] = None + dojo: str + date: date + plan_duration_weeks: int + focus_areas: List[str] = [] + predecessor_plan_id: Optional[str] = None + age_group: str + created_at: datetime = Field(default_factory=datetime.utcnow) + phases: List[PlanPhase] + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# ---- CRUD Endpoints for Exercise ---- +@router.post("/exercise", 
response_model=Exercise) +def create_exercise(ex: Exercise): + # Ensure Exercise collection exists + if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{ex.title}. {ex.summary}").tolist() + point = PointStruct(id=ex.id, vector=vec, payload=ex.dict()) + qdrant.upsert(collection_name="exercises", points=[point]) + return ex + +@router.get("/exercise", response_model=List[Exercise]) +def list_exercises( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + tags: Optional[str] = Query(None) +): + filters = [] + if discipline: + filters.append({"key": "discipline", "match": {"value": discipline}}) + if group: + filters.append({"key": "group", "match": {"value": group}}) + if tags: + for t in tags.split(","): + filters.append({"key": "keywords", "match": {"value": t.strip()}}) + pts, _ = qdrant.scroll( + collection_name="exercises", + scroll_filter={"must": filters} if filters else None, + limit=10000 + ) + return [Exercise(**pt.payload) for pt in pts] + +# ---- CRUD Endpoints for TrainingPlan ---- +@router.post("/plan", response_model=TrainingPlan) +def create_plan(plan: TrainingPlan): + # Ensure TrainingPlan collection exists + if not qdrant.collection_exists("training_plans"): + qdrant.recreate_collection( + collection_name="training_plans", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{plan.title}. {plan.short_description}").tolist() + point = PointStruct(id=plan.id, vector=vec, payload=plan.dict()) + qdrant.upsert(collection_name="training_plans", points=[point]) + return plan + +@router.get("/plan", response_model=List[TrainingPlan]) +def list_plans( + collection: str = Query("training_plans"), + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + dojo: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + return [] + pts, _ = qdrant.scroll(collection_name=collection, limit=10000) + result = [] + for pt in pts: + pl = TrainingPlan(**pt.payload) + if discipline and pl.discipline != discipline: + continue + if group and pl.group != group: + continue + if dojo and pl.dojo != dojo: + continue + result.append(pl) + return result + +# ---- Delete Endpoints ---- +@router.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + filt = [] + if source: + filt.append({"key": "source", "match": {"value": source}}) + if type: + filt.append({"key": "type", "match": {"value": type}}) + if not filt: + raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.") + pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must": filt}, limit=10000) + ids = [str(p.id) for p in pts] + if not ids: + return DeleteResponse(status="🔍 Keine Einträge gefunden.", count=0, collection=collection) + qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids)) + return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection) + +@router.delete("/delete-collection", response_model=DeleteResponse) 
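+# Drops the whole collection; create_exercise/create_plan above recreate it lazily on the next write.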
+def delete_collection( + collection: str = Query(...) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) diff --git a/llm-api/llm_api.py b/llm-api/llm_api.py new file mode 100644 index 0000000..fffef53 --- /dev/null +++ b/llm-api/llm_api.py @@ -0,0 +1,33 @@ +from dotenv import load_dotenv +load_dotenv() # Lädt Variablen aus .env in os.environ + + +from fastapi import FastAPI +from fastapi.responses import JSONResponse +from clients import model, qdrant +from wiki_router import router as wiki_router +from embed_router import router as embed_router +from exercise_router import router as exercise_router + +# Version +__version__ = "1.1.6" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + + + +# FastAPI-Instanz +app = FastAPI( + title="KI Trainerassistent API", + description="Modulare API für Trainingsplanung und MediaWiki-Import", + version=__version__, +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request, exc): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler."}) + +# Router einbinden +app.include_router(wiki_router, prefix="/import/wiki") +app.include_router(embed_router) +app.include_router(exercise_router) diff --git a/llm-api/old strukture/llm_api.py b/llm-api/old strukture/llm_api.py new file mode 100644 index 0000000..0a20440 --- /dev/null +++ b/llm-api/old strukture/llm_api.py @@ -0,0 +1,421 @@ +from fastapi import FastAPI, Query, HTTPException, Request +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from typing import List, Dict, Any, Optional +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct, PointIdsList +from uuid import uuid4 +import requests +import os +from datetime import datetime, date + +# Version hochgezählt +__version__ = "1.1.6" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + +# Ollama-Konfiguration +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/api/generate") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral:latest") + +# ----------------------- +# MediaWiki-Konfiguration +# ----------------------- +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +WIKI_BOT_USER = os.getenv("WIKI_BOT_USER", "") +WIKI_BOT_PASSWORD = os.getenv("WIKI_BOT_PASSWORD", "") + +# FastAPI-Instanz +app = FastAPI( + title="KI Trainerassistent API", + description="Lokale API für Trainingsplanung", + version=__version__, + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json" +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request: Request, exc: Exception): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler. Bitte später erneut versuchen."}) + +# Globaler Session für MediaWiki-API +wiki_session = requests.Session() + +# Health-Check für MediaWiki +@app.get("/import/wiki/health") +def wiki_health(): + """ + Prüft, ob der MediaWiki-Server erreichbar ist. 
+ """ + params = {"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=5) + r.raise_for_status() + resp = r.json() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + # Versuche Servernamen auszulesen, aber gib OK zurück, wenn es fehlt + server = resp.get("query", {}).get("general", {}).get("servername") + if server: + return {"status": "ok", "server": server} + return {"status": "ok", "server": None} + +# ------------------------ +# MediaWiki Login Endpoint +# ------------------------ +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: Optional[str] = None + +@app.post("/import/wiki/login", response_model=WikiLoginResponse) +def wiki_login(data: WikiLoginRequest): + """ + Führt Login gegen MediaWiki-API durch und speichert Session-Cookies. + """ + # Schritt 1: Login-Token holen + params_token = {"action": "query", "meta": "tokens", "type": "login", "format": "json"} + try: + resp1 = wiki_session.get(WIKI_API_URL, params=params_token, timeout=10) + resp1.raise_for_status() + token = resp1.json().get("query", {}).get("tokens", {}).get("logintoken") + if not token: + raise ValueError("Kein Login-Token erhalten") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Fehler Token abrufen: {e}") + + # Schritt 2: Login mit Token + login_data = { + "action": "login", "format": "json", + "lgname": data.username, "lgpassword": data.password, + "lgtoken": token + } + try: + resp2 = wiki_session.post(WIKI_API_URL, data=login_data, timeout=10) + resp2.raise_for_status() + result = resp2.json().get("login", {}) + if result.get("result") != "Success": + return WikiLoginResponse(status="failed", message=result.get("reason", "Login fehlgeschlagen")) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Fehler Login: {e}") + + return WikiLoginResponse(status="success") + +# ------------------------ + # Fallback: Connectivity ist gegeben, aber kein Servernamen + return {"status": "ok", "server": None} + +# ------------------------"status": "ok", "server": general["servername"]} + +# ------------------------ +# ------------------------ +# Modelle für Embed/Search +# ------------------------ +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + +class EmbedResponse(BaseModel): + status: str + count: int + collection: str + +class SearchResultItem(BaseModel): + score: float = Field(..., ge=0) + text: str + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# ------------------------------------ +# Modelle für Exercises & Plans +# ------------------------------------ +class Exercise(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + summary: str + short_description: str + keywords: List[str] = [] 
+ link: Optional[str] = None + + discipline: str + group: Optional[str] = None + age_group: str + target_group: str + min_participants: int + duration_minutes: int + + capabilities: Dict[str,int] = {} + category: str + + purpose: str + execution: str + notes: str + preparation: str + method: str + equipment: List[str] = [] + +class PhaseExercise(BaseModel): + exercise_id: str + cond_load: Dict[str, Any] = {} + coord_load: Dict[str, Any] = {} + instructions: str = "" + +class PlanPhase(BaseModel): + name: str + duration_minutes: int + method: str + method_notes: str = "" + exercises: List[PhaseExercise] + +class TrainingPlan(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + short_description: str + + collection: str + discipline: str + group: Optional[str] = None + dojo: str + date: date + plan_duration_weeks: int + focus_areas: List[str] = [] + predecessor_plan_id: Optional[str] = None + + age_group: str + created_at: datetime = Field(default_factory=datetime.utcnow) + + phases: List[PlanPhase] + +# ---------------------------------- +# Embedding-Modell und Qdrant-Client +# ---------------------------------- +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient( + host=os.getenv("QDRANT_HOST", "localhost"), + port=int(os.getenv("QDRANT_PORT", 6333)) +) + +# Ensure Exercise-Collection exists +if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# Ensure TrainingPlan-Collection exists +PLAN_COLL = "training_plans" +if not qdrant.collection_exists(PLAN_COLL): + qdrant.recreate_collection( + collection_name=PLAN_COLL, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# ---------------------- +# Endpunkte für Exercises +# ---------------------- +@app.post("/exercise", response_model=Exercise) +def create_exercise(ex: Exercise): + # Ensure collection exists + if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{ex.title}. 
{ex.summary}").tolist() + point = PointStruct(id=ex.id, vector=vec, payload=ex.dict()) + qdrant.upsert(collection_name="exercises", points=[point]) + return ex + +@app.get("/exercise", response_model=List[Exercise]) +def list_exercises( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + tags: Optional[str] = Query(None) +): + filters = [] + if discipline: + filters.append({"key":"discipline","match":{"value":discipline}}) + if group: + filters.append({"key":"group","match":{"value":group}}) + if tags: + for t in tags.split(","): + filters.append({"key":"keywords","match":{"value":t.strip()}}) + if filters: + pts, _ = qdrant.scroll( + collection_name="exercises", + scroll_filter={"must": filters}, + limit=10000 + ) + else: + pts, _ = qdrant.scroll(collection_name="exercises", limit=10000) + return [Exercise(**pt.payload) for pt in pts] + +# ----------------- +# Bestehende Endpunkte +# ----------------- +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + points.append(PointStruct(id=str(uuid4()), vector=embeddings[i], payload=payload)) + qdrant.upsert(collection_name=collection_name, points=points) + return {"status":"✅ embeddings saved","count":len(points),"collection":collection_name} + +@app.get("/search", response_model=List[SearchResultItem]) +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [SearchResultItem(score=r.score, text=r.payload['text']) for r in res] + +@app.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): + raise HTTPException(status_code=400, detail="'query' darf nicht leer sein.") + if not (1 <= data.context_limit <= 10): + raise HTTPException(status_code=400, detail="'context_limit' muss zwischen 1 und 10 liegen.") + hits = qdrant.search( + collection_name=data.collection, + query_vector=model.encode(data.query).tolist(), + limit=data.context_limit + ) + context = "\n".join(h.payload['text'] for h in hits) + payload = {"model":OLLAMA_MODEL,"prompt":f"Context:\n{context}\nQuestion: {data.query}","stream":False} + try: + r = requests.post(OLLAMA_URL, json=payload, timeout=30) + r.raise_for_status() + except Exception: + raise HTTPException(status_code=502, detail="LLM-Service-Fehler.") + return PromptResponse(answer=r.json().get("response",""), context=context, collection=data.collection) + +@app.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None), + owner: Optional[str] = Query(None), + 
category: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + filt = [] + if source: filt.append({"key":"source","match":{"value":source}}) + if type: filt.append({"key":"type","match":{"value":type}}) + if owner: filt.append({"key":"owner","match":{"value":owner}}) + if category: filt.append({"key":"category","match":{"value":category}}) + if not filt: + raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.") + pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must":filt}, limit=10000) + ids = [str(p.id) for p in pts] + if not ids: + return DeleteResponse(status="🔍 Keine passenden Einträge gefunden.", count=0, collection=collection) + qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids)) + return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection) + +@app.delete("/delete-collection", response_model=DeleteResponse) +def delete_collection(collection: str = Query(...)): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) + +# ------------------------ +# Endpunkte für TrainingPlans +# ------------------------ +@app.post("/plan", response_model=TrainingPlan) +def create_plan(plan: TrainingPlan): + # Ensure plan collection exists + if not qdrant.collection_exists(PLAN_COLL): + qdrant.recreate_collection( + collection_name=PLAN_COLL, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{plan.title}. 
{plan.short_description}").tolist() + payload = plan.dict() + qdrant.upsert(collection_name=PLAN_COLL, points=[PointStruct(id=plan.id, vector=vec, payload=payload)]) + return plan + +@app.get("/plan", response_model=List[TrainingPlan]) +def list_plans( + collection: str = Query(PLAN_COLL), + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + dojo: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + return [] + pts, _ = qdrant.scroll(collection_name=collection, limit=10000) + result: List[TrainingPlan] = [] + for pt in pts: + plan = TrainingPlan(**pt.payload) + if discipline and plan.discipline != discipline: continue + if group and plan.group != group: continue + if dojo and plan.dojo != dojo: continue + result.append(plan) + return result diff --git a/llm-api/old strukture/llm_api1.1.1.py b/llm-api/old strukture/llm_api1.1.1.py new file mode 100644 index 0000000..7613ceb --- /dev/null +++ b/llm-api/old strukture/llm_api1.1.1.py @@ -0,0 +1,319 @@ +from fastapi import FastAPI, Query, HTTPException, Request +from fastapi.responses import JSONResponse +from fastapi.openapi.utils import get_openapi +from pydantic import BaseModel, Field +from typing import List, Dict, Any, Optional +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct, PointIdsList +from uuid import uuid4 +import requests +import os +from datetime import datetime, date + +# Version hochgezählt +__version__ = "1.1.1" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + +# Ollama-Konfiguration +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/api/generate") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral:latest") + +# FastAPI-Instanz +app = FastAPI( + title="KI Trainerassistent API", + description="Lokale API für Trainingsplanung", + version=__version__, + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json" +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request: Request, exc: Exception): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler. 
Bitte später erneut versuchen."}) + +# ------------------------ +# Modelle für Embed/Search +# ------------------------ +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + +class EmbedResponse(BaseModel): + status: str + count: int + collection: str + +class SearchResultItem(BaseModel): + score: float = Field(..., ge=0) + text: str + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# ------------------------------------ +# Neue Modelle für Exercises & Plans +# ------------------------------------ +class Exercise(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + summary: str + short_description: str + keywords: List[str] = [] + link: Optional[str] = None + + discipline: str + group: Optional[str] = None + age_group: str + target_group: str + min_participants: int + duration_minutes: int + + capabilities: Dict[str,int] = {} + category: str + + purpose: str + execution: str + notes: str + preparation: str + method: str + equipment: List[str] = [] + +class PhaseExercise(BaseModel): + exercise_id: str + cond_load: Dict[str, Any] = {} + coord_load: Dict[str, Any] = {} + instructions: str = "" + +class PlanPhase(BaseModel): + name: str + duration_minutes: int + method: str + method_notes: str = "" + exercises: List[PhaseExercise] + +class TrainingPlan(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + short_description: str + + collection: str + discipline: str + group: Optional[str] = None + dojo: str + date: date + plan_duration_weeks: int + focus_areas: List[str] = [] + predecessor_plan_id: Optional[str] = None + + age_group: str + created_at: datetime = Field(default_factory=datetime.utcnow) + + phases: List[PlanPhase] + +# ---------------------------------- +# Embedding-Modell und Qdrant-Client +# ---------------------------------- +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient( + host=os.getenv("QDRANT_HOST", "localhost"), + port=int(os.getenv("QDRANT_PORT", 6333)) +) + +# Ensure Exercise-Collection exists +if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# ---------------------- +# Endpunkte für Exercises +# ---------------------- +@app.post("/exercise", response_model=Exercise) +def create_exercise(ex: Exercise): + vec = model.encode(f"{ex.title}. 
{ex.summary}").tolist() + point = PointStruct(id=ex.id, vector=vec, payload=ex.dict()) + qdrant.upsert(collection_name="exercises", points=[point]) + return ex + +@app.get("/exercise", response_model=List[Exercise]) +def list_exercises( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + tags: Optional[str] = Query(None) # kommagetrennt +): + filters = [] + if discipline: + filters.append({"key":"discipline","match":{"value":discipline}}) + if group: + filters.append({"key":"group","match":{"value":group}}) + if tags: + for t in tags.split(","): + filters.append({"key":"keywords","match":{"value":t.strip()}}) + if filters: + pts, _ = qdrant.scroll( + collection_name="exercises", + scroll_filter={"must": filters}, + limit=10000 + ) + else: + pts, _ = qdrant.scroll(collection_name="exercises", limit=10000) + return [Exercise(**pt.payload) for pt in pts] + +# ----------------- +# Bestehende Endpunkte +# ----------------- +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + points.append(PointStruct(id=str(uuid4()), vector=embeddings[i], payload=payload)) + qdrant.upsert(collection_name=collection_name, points=points) + return {"status":"✅ embeddings saved","count":len(points),"collection":collection_name} + +@app.get("/search", response_model=List[SearchResultItem]) +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [SearchResultItem(score=r.score, text=r.payload['text']) for r in res] + +@app.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): raise HTTPException(status_code=400, detail="'query' darf nicht leer sein.") + if not (1 <= data.context_limit <= 10): raise HTTPException(status_code=400, detail="'context_limit' muss zwischen 1 und 10 liegen.") + hits = qdrant.search( + collection_name=data.collection, + query_vector=model.encode(data.query).tolist(), + limit=data.context_limit + ) + context = "\n".join(h.payload['text'] for h in hits) + payload = {"model":OLLAMA_MODEL,"prompt":f"Context:\n{context}\nQuestion: {data.query}","stream":False} + try: + r = requests.post(OLLAMA_URL, json=payload, timeout=30); r.raise_for_status() + except Exception: + raise HTTPException(status_code=502, detail="LLM-Service-Fehler.") + return PromptResponse(answer=r.json().get("response",""), context=context, collection=data.collection) + +@app.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None), + owner: Optional[str] = 
Query(None), + category: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + filt = [] + if source: filt.append({"key":"source","match":{"value":source}}) + if type: filt.append({"key":"type","match":{"value":type}}) + if owner: filt.append({"key":"owner","match":{"value":owner}}) + if category: filt.append({"key":"category","match":{"value":category}}) + if not filt: + raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.") + pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must":filt}, limit=10000) + ids = [str(p.id) for p in pts] + if not ids: + return DeleteResponse(status="🔍 Keine passenden Einträge gefunden.", count=0, collection=collection) + qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids)) + return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection) + +@app.delete("/delete-collection", response_model=DeleteResponse) +def delete_collection(collection: str = Query(...)): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) + +# ------------------------ +# Endpunkte für TrainingPlans +# ------------------------ +@app.post("/plan", response_model=TrainingPlan) +def create_plan(plan: TrainingPlan): + coll = "training_plans" + if not qdrant.collection_exists(coll): + qdrant.recreate_collection( + collection_name=coll, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{plan.title}. 
{plan.short_description}").tolist() + payload = plan.dict() + qdrant.upsert(collection_name=coll, points=[PointStruct(id=plan.id, vector=vec, payload=payload)]) + return plan + +@app.get("/plan", response_model=List[TrainingPlan]) +def list_plans( + collection: str = Query("training_plans"), + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + dojo: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + return [] + pts, _ = qdrant.scroll(collection_name=collection, limit=10000) + result = [] + for p in pts: + pl = TrainingPlan(**p.payload) + if discipline and pl.discipline != discipline: continue + if group and pl.group != group: continue + if dojo and pl.dojo != dojo: continue + result.append(pl) + return result diff --git a/llm-api/old strukture/llm_api1.1.11.py b/llm-api/old strukture/llm_api1.1.11.py new file mode 100644 index 0000000..713c36c --- /dev/null +++ b/llm-api/old strukture/llm_api1.1.11.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +# llm_api.py — Version 1.1.11 + +from fastapi import FastAPI, Query, HTTPException, Request +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from typing import List, Dict, Any, Optional +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct, PointIdsList +from uuid import uuid4 +import requests +import os +from datetime import datetime, date + +# Version hochgezählt +__version__ = "1.1.11" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + +# FastAPI-Anwendung +app = FastAPI( + title="KI Trainerassistent API", + description="Lokale API für Trainingsplanung", + version=__version__, + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json" +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request: Request, exc: Exception): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler. 
Bitte später erneut versuchen."}) + +# -------------------------------- +# Modelle für Embed/Search +# -------------------------------- +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + +class EmbedResponse(BaseModel): + status: str + count: int + collection: str + +class SearchResultItem(BaseModel): + score: float = Field(..., ge=0) + text: str + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# -------------------------------- +# Modelle für Exercises & TrainingPlans +# -------------------------------- +class Exercise(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + summary: str + short_description: str + keywords: List[str] = [] + link: Optional[str] = None + discipline: str + group: Optional[str] = None + age_group: str + target_group: str + min_participants: int + duration_minutes: int + capabilities: Dict[str, int] = {} + category: str + purpose: str + execution: str + notes: str + preparation: str + method: str + equipment: List[str] = [] + +class PhaseExercise(BaseModel): + exercise_id: str + cond_load: Dict[str, Any] = {} + coord_load: Dict[str, Any] = {} + instructions: str = "" + +class PlanPhase(BaseModel): + name: str + duration_minutes: int + method: str + method_notes: str = "" + exercises: List[PhaseExercise] + +class TrainingPlan(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + short_description: str + collection: str + discipline: str + group: Optional[str] = None + dojo: str + date: date + plan_duration_weeks: int + focus_areas: List[str] = [] + predecessor_plan_id: Optional[str] = None + age_group: str + created_at: datetime = Field(default_factory=datetime.utcnow) + phases: List[PlanPhase] + +# ---------------------------------- +# Embedding-Modell und Qdrant-Client +# ---------------------------------- +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient( + host=os.getenv("QDRANT_HOST", "localhost"), + port=int(os.getenv("QDRANT_PORT", 6333)) +) + +# Collection-Namen +EXERCISE_COLL = "exercises" +PLAN_COLL = "training_plans" + +# Sicherstellen, dass Collections existieren +if not qdrant.collection_exists(EXERCISE_COLL): + qdrant.recreate_collection( + collection_name=EXERCISE_COLL, + vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE) + ) +if not qdrant.collection_exists(PLAN_COLL): + qdrant.recreate_collection( + collection_name=PLAN_COLL, + vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE) + ) + +# ---------------------------------- +# Endpunkte für Exercises +# ---------------------------------- +@app.post("/exercise", response_model=Exercise) +def create_exercise(ex: Exercise): + vec = model.encode(f"{ex.title}. 
{ex.summary}").tolist() + point = PointStruct(id=ex.id, vector=vec, payload=ex.dict()) + qdrant.upsert(collection_name=EXERCISE_COLL, points=[point]) + return ex + +@app.get("/exercise", response_model=List[Exercise]) +def list_exercises( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + tags: Optional[str] = Query(None) +): + filters = [] + if discipline: + filters.append({"key": "discipline", "match": {"value": discipline}}) + if group: + filters.append({"key": "group", "match": {"value": group}}) + if tags: + for t in tags.split(","): + filters.append({"key": "keywords", "match": {"value": t.strip()}}) + if filters: + pts, _ = qdrant.scroll(collection_name=EXERCISE_COLL, scroll_filter={"must": filters}, limit=10000) + else: + pts, _ = qdrant.scroll(collection_name=EXERCISE_COLL, limit=10000) + return [Exercise(**pt.payload) for pt in pts] + +# ---------------------------------- +# Endpunkte für TrainingPlans +# ---------------------------------- +@app.post("/plan", response_model=TrainingPlan) +def create_plan(plan: TrainingPlan): + vec = model.encode(f"{plan.title}. {plan.short_description}").tolist() + point = PointStruct(id=plan.id, vector=vec, payload=plan.dict()) + qdrant.upsert(collection_name=PLAN_COLL, points=[point]) + return plan + +@app.get("/plan", response_model=List[TrainingPlan]) +def list_plans( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + dojo: Optional[str] = Query(None) +): + filters = [] + if discipline: + filters.append({"key": "discipline", "match": {"value": discipline}}) + if group: + filters.append({"key": "group", "match": {"value": group}}) + if dojo: + filters.append({"key": "dojo", "match": {"value": dojo}}) + if filters: + pts, _ = qdrant.scroll(collection_name=PLAN_COLL, scroll_filter={"must": filters}, limit=10000) + else: + pts, _ = qdrant.scroll(collection_name=PLAN_COLL, limit=10000) + return [TrainingPlan(**pt.payload) for pt in pts] + +# ---------------------------------- +# Endpunkte Embed/Search und Löschen +# ---------------------------------- +@app.post("/embed", response_model=EmbedResponse) +def embed_texts(data: EmbedRequest): + collection_name = data.collection + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [] + for i, chunk in enumerate(data.chunks): + payload = {**chunk.dict(), "imported_at": chunk.imported_at or datetime.utcnow().isoformat()} + points.append(PointStruct(id=str(uuid4()), vector=embeddings[i], payload=payload)) + qdrant.upsert(collection_name=collection_name, points=points) + return EmbedResponse(status="✅ embeddings saved", count=len(points), collection=collection_name) + +@app.get("/search", response_model=List[SearchResultItem]) +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [SearchResultItem(score=r.score, text=r.payload['text']) for r in res] + +@app.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): + raise HTTPException(status_code=400, detail="'query' darf nicht leer sein.") + if not (1 <= data.context_limit <= 10): + raise 
HTTPException(status_code=400, detail="'context_limit' muss zwischen 1 und 10 liegen.")
+ hits = qdrant.search(collection_name=data.collection, query_vector=model.encode(data.query).tolist(), limit=data.context_limit)
+ context = "\n".join(h.payload['text'] for h in hits)
+ try:
+ r = requests.post(
+ os.getenv("OLLAMA_URL", "http://localhost:11434/api/generate"),
+ json={"model": os.getenv("OLLAMA_MODEL", "mistral:latest"), "prompt": f"Context:\n{context}\nQuestion: {data.query}", "stream": False},
+ timeout=30
+ )
+ r.raise_for_status()
+ except Exception:
+ raise HTTPException(status_code=502, detail="LLM-Service-Fehler.")
+ return PromptResponse(answer=r.json().get("response", ""), context=context, collection=data.collection)
+
+@app.delete("/delete-source", response_model=DeleteResponse)
+def delete_by_source(
+ collection: str = Query(...), source: Optional[str] = Query(None), type: Optional[str] = Query(None), owner: Optional[str] = Query(None), category: Optional[str] = Query(None)
+):
+ if not qdrant.collection_exists(collection):
+ raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.")
+ filters = []
+ if source:
+ filters.append({"key": "source", "match": {"value": source}})
+ if type:
+ filters.append({"key": "type", "match": {"value": type}})
+ if owner:
+ filters.append({"key": "owner", "match": {"value": owner}})
+ if category:
+ filters.append({"key": "category", "match": {"value": category}})
+ if not filters:
+ raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.")
+ pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must": filters}, limit=10000)
+ ids = [str(p.id) for p in pts]
+ if not ids:
+ return DeleteResponse(status="🔍 Keine passenden Einträge gefunden.", count=0, collection=collection)
+ qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids))
+ return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection)
+
+@app.delete("/delete-collection", response_model=DeleteResponse)
+def delete_collection(collection: str = Query(...)):
+ if not qdrant.collection_exists(collection):
+ raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.")
+ qdrant.delete_collection(collection_name=collection)
+ return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection)
+
+# ----------------------------------------------------------------
+# MediaWiki-Login (v1.1.11)
+# ----------------------------------------------------------------
+MEDIAWIKI_API_URL = os.getenv("MEDIAWIKI_API_URL", "https://www.Karatetrainer.de/api.php")
+MEDIAWIKI_USER = os.getenv("MEDIAWIKI_USER", "LarsS@APIBot")
+# Kein Default-Passwort im Code hinterlegen: Zugangsdaten ausschließlich über die Umgebung setzen.
+MEDIAWIKI_PASSWORD = os.getenv("MEDIAWIKI_PASSWORD", "")
+wiki_session = requests.Session()
+
+@app.post("/import/wiki/login")
+async def import_wiki_login():
+ try:
+ params_token = {"action": "query", "meta": "tokens", "type": "login", "format": "json"}
+ resp1 = wiki_session.get(MEDIAWIKI_API_URL, params=params_token)
+ resp1.raise_for_status()
+ token = resp1.json()["query"]["tokens"]["logintoken"]
+
+ login_params = {"action": "login", "format": "json"}
+ login_data = {"lgname": MEDIAWIKI_USER, "lgpassword": MEDIAWIKI_PASSWORD, "lgtoken": token}
+ resp2 = wiki_session.post(MEDIAWIKI_API_URL, params=login_params, data=login_data)
+ resp2.raise_for_status()
+ result = resp2.json().get("login", {})
+
+ if result.get("result") == "Success":
+ return {"status": "✅ MediaWiki login 
erfolgreich."} + else: + raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {result.get('reason','unbekannter Fehler')}") + + except requests.RequestException as e: + raise HTTPException(status_code=502, detail=f"Fehler bei Wiki-API-Aufruf: {str(e)}") diff --git a/llm-api/old strukture/llm_api1.1.5.py b/llm-api/old strukture/llm_api1.1.5.py new file mode 100644 index 0000000..c3441c9 --- /dev/null +++ b/llm-api/old strukture/llm_api1.1.5.py @@ -0,0 +1,341 @@ +from fastapi import FastAPI, Query, HTTPException, Request +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from typing import List, Dict, Any, Optional +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct, PointIdsList +from uuid import uuid4 +import requests +import os +from datetime import datetime, date + +# Version hochgezählt +__version__ = "1.1.5" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + +# Ollama-Konfiguration +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/api/generate") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral:latest") + +# FastAPI-Instanz +app = FastAPI( + title="KI Trainerassistent API", + description="Lokale API für Trainingsplanung", + version=__version__, + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json" +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request: Request, exc: Exception): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler. Bitte später erneut versuchen."}) + +# ------------------------ +# Modelle für Embed/Search +# ------------------------ +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + +class EmbedResponse(BaseModel): + status: str + count: int + collection: str + +class SearchResultItem(BaseModel): + score: float = Field(..., ge=0) + text: str + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# ------------------------------------ +# Modelle für Exercises & Plans +# ------------------------------------ +class Exercise(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + summary: str + short_description: str + keywords: List[str] = [] + link: Optional[str] = None + + discipline: str + group: Optional[str] = None + age_group: str + target_group: str + min_participants: int + duration_minutes: int + + capabilities: Dict[str,int] = {} + category: str + + purpose: str + execution: str + notes: str + preparation: str + method: str + equipment: List[str] = [] + +class PhaseExercise(BaseModel): + exercise_id: str + cond_load: Dict[str, Any] = {} + coord_load: Dict[str, Any] = {} + instructions: str = "" + +class PlanPhase(BaseModel): + name: str + duration_minutes: int + method: str + method_notes: str = "" + 
exercises: List[PhaseExercise] + +class TrainingPlan(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + short_description: str + + collection: str + discipline: str + group: Optional[str] = None + dojo: str + date: date + plan_duration_weeks: int + focus_areas: List[str] = [] + predecessor_plan_id: Optional[str] = None + + age_group: str + created_at: datetime = Field(default_factory=datetime.utcnow) + + phases: List[PlanPhase] + +# ---------------------------------- +# Embedding-Modell und Qdrant-Client +# ---------------------------------- +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient( + host=os.getenv("QDRANT_HOST", "localhost"), + port=int(os.getenv("QDRANT_PORT", 6333)) +) + +# Ensure Exercise-Collection exists +if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# Ensure TrainingPlan-Collection exists +PLAN_COLL = "training_plans" +if not qdrant.collection_exists(PLAN_COLL): + qdrant.recreate_collection( + collection_name=PLAN_COLL, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# ---------------------- +# Endpunkte für Exercises +# ---------------------- +@app.post("/exercise", response_model=Exercise) +def create_exercise(ex: Exercise): + # Ensure collection exists + if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{ex.title}. {ex.summary}").tolist() + point = PointStruct(id=ex.id, vector=vec, payload=ex.dict()) + qdrant.upsert(collection_name="exercises", points=[point]) + return ex + +@app.get("/exercise", response_model=List[Exercise]) +def list_exercises( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + tags: Optional[str] = Query(None) +): + filters = [] + if discipline: + filters.append({"key":"discipline","match":{"value":discipline}}) + if group: + filters.append({"key":"group","match":{"value":group}}) + if tags: + for t in tags.split(","): + filters.append({"key":"keywords","match":{"value":t.strip()}}) + if filters: + pts, _ = qdrant.scroll( + collection_name="exercises", + scroll_filter={"must": filters}, + limit=10000 + ) + else: + pts, _ = qdrant.scroll(collection_name="exercises", limit=10000) + return [Exercise(**pt.payload) for pt in pts] + +# ----------------- +# Bestehende Endpunkte +# ----------------- +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + 
points.append(PointStruct(id=str(uuid4()), vector=embeddings[i], payload=payload)) + qdrant.upsert(collection_name=collection_name, points=points) + return {"status":"✅ embeddings saved","count":len(points),"collection":collection_name} + +@app.get("/search", response_model=List[SearchResultItem]) +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [SearchResultItem(score=r.score, text=r.payload['text']) for r in res] + +@app.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): + raise HTTPException(status_code=400, detail="'query' darf nicht leer sein.") + if not (1 <= data.context_limit <= 10): + raise HTTPException(status_code=400, detail="'context_limit' muss zwischen 1 und 10 liegen.") + hits = qdrant.search( + collection_name=data.collection, + query_vector=model.encode(data.query).tolist(), + limit=data.context_limit + ) + context = "\n".join(h.payload['text'] for h in hits) + payload = {"model":OLLAMA_MODEL,"prompt":f"Context:\n{context}\nQuestion: {data.query}","stream":False} + try: + r = requests.post(OLLAMA_URL, json=payload, timeout=30) + r.raise_for_status() + except Exception: + raise HTTPException(status_code=502, detail="LLM-Service-Fehler.") + return PromptResponse(answer=r.json().get("response",""), context=context, collection=data.collection) + +@app.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None), + owner: Optional[str] = Query(None), + category: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + filt = [] + if source: filt.append({"key":"source","match":{"value":source}}) + if type: filt.append({"key":"type","match":{"value":type}}) + if owner: filt.append({"key":"owner","match":{"value":owner}}) + if category: filt.append({"key":"category","match":{"value":category}}) + if not filt: + raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.") + pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must":filt}, limit=10000) + ids = [str(p.id) for p in pts] + if not ids: + return DeleteResponse(status="🔍 Keine passenden Einträge gefunden.", count=0, collection=collection) + qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids)) + return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection) + +@app.delete("/delete-collection", response_model=DeleteResponse) +def delete_collection(collection: str = Query(...)): + if not qdrant.collection_exists(collection): + raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.") + qdrant.delete_collection(collection_name=collection) + return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection) + +# ------------------------ +# Endpunkte für TrainingPlans +# ------------------------ +@app.post("/plan", response_model=TrainingPlan) +def create_plan(plan: TrainingPlan): + # Ensure plan collection exists + if not qdrant.collection_exists(PLAN_COLL): + qdrant.recreate_collection( + collection_name=PLAN_COLL, + vectors_config=VectorParams( + 
size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{plan.title}. {plan.short_description}").tolist() + payload = plan.dict() + qdrant.upsert(collection_name=PLAN_COLL, points=[PointStruct(id=plan.id, vector=vec, payload=payload)]) + return plan + +@app.get("/plan", response_model=List[TrainingPlan]) +def list_plans( + collection: str = Query(PLAN_COLL), + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + dojo: Optional[str] = Query(None) +): + if not qdrant.collection_exists(collection): + return [] + pts, _ = qdrant.scroll(collection_name=collection, limit=10000) + result: List[TrainingPlan] = [] + for pt in pts: + plan = TrainingPlan(**pt.payload) + if discipline and plan.discipline != discipline: continue + if group and plan.group != group: continue + if dojo and plan.dojo != dojo: continue + result.append(plan) + return result diff --git a/llm-api/old strukture/llm_api1.1.6.py b/llm-api/old strukture/llm_api1.1.6.py new file mode 100644 index 0000000..0a20440 --- /dev/null +++ b/llm-api/old strukture/llm_api1.1.6.py @@ -0,0 +1,421 @@ +from fastapi import FastAPI, Query, HTTPException, Request +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field +from typing import List, Dict, Any, Optional +from sentence_transformers import SentenceTransformer +from qdrant_client import QdrantClient +from qdrant_client.models import VectorParams, Distance, PointStruct, PointIdsList +from uuid import uuid4 +import requests +import os +from datetime import datetime, date + +# Version hochgezählt +__version__ = "1.1.6" +print(f"[DEBUG] llm_api.py version {__version__} loaded from {__file__}", flush=True) + +# Ollama-Konfiguration +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/api/generate") +OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral:latest") + +# ----------------------- +# MediaWiki-Konfiguration +# ----------------------- +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +WIKI_BOT_USER = os.getenv("WIKI_BOT_USER", "") +WIKI_BOT_PASSWORD = os.getenv("WIKI_BOT_PASSWORD", "") + +# FastAPI-Instanz +app = FastAPI( + title="KI Trainerassistent API", + description="Lokale API für Trainingsplanung", + version=__version__, + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json" +) + +# Globaler Fehlerhandler +@app.exception_handler(Exception) +async def unicorn_exception_handler(request: Request, exc: Exception): + return JSONResponse(status_code=500, content={"detail": "Interner Serverfehler. Bitte später erneut versuchen."}) + +# Globaler Session für MediaWiki-API +wiki_session = requests.Session() + +# Health-Check für MediaWiki +@app.get("/import/wiki/health") +def wiki_health(): + """ + Prüft, ob der MediaWiki-Server erreichbar ist. 
+ """ + params = {"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=5) + r.raise_for_status() + resp = r.json() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + # Versuche Servernamen auszulesen, aber gib OK zurück, wenn es fehlt + server = resp.get("query", {}).get("general", {}).get("servername") + if server: + return {"status": "ok", "server": server} + return {"status": "ok", "server": None} + +# ------------------------ +# MediaWiki Login Endpoint +# ------------------------ +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: Optional[str] = None + +@app.post("/import/wiki/login", response_model=WikiLoginResponse) +def wiki_login(data: WikiLoginRequest): + """ + Führt Login gegen MediaWiki-API durch und speichert Session-Cookies. + """ + # Schritt 1: Login-Token holen + params_token = {"action": "query", "meta": "tokens", "type": "login", "format": "json"} + try: + resp1 = wiki_session.get(WIKI_API_URL, params=params_token, timeout=10) + resp1.raise_for_status() + token = resp1.json().get("query", {}).get("tokens", {}).get("logintoken") + if not token: + raise ValueError("Kein Login-Token erhalten") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Fehler Token abrufen: {e}") + + # Schritt 2: Login mit Token + login_data = { + "action": "login", "format": "json", + "lgname": data.username, "lgpassword": data.password, + "lgtoken": token + } + try: + resp2 = wiki_session.post(WIKI_API_URL, data=login_data, timeout=10) + resp2.raise_for_status() + result = resp2.json().get("login", {}) + if result.get("result") != "Success": + return WikiLoginResponse(status="failed", message=result.get("reason", "Login fehlgeschlagen")) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Fehler Login: {e}") + + return WikiLoginResponse(status="success") + +# ------------------------ + # Fallback: Connectivity ist gegeben, aber kein Servernamen + return {"status": "ok", "server": None} + +# ------------------------"status": "ok", "server": general["servername"]} + +# ------------------------ +# ------------------------ +# Modelle für Embed/Search +# ------------------------ +class ChunkInput(BaseModel): + text: str + source: str + source_type: str = "" + title: str = "" + version: str = "" + related_to: str = "" + tags: List[str] = [] + owner: str = "" + context_tag: Optional[str] = None + imported_at: Optional[str] = None + chunk_index: Optional[int] = None + category: Optional[str] = None + +class EmbedRequest(BaseModel): + chunks: List[ChunkInput] + collection: str = "default" + +class PromptRequest(BaseModel): + query: str + context_limit: int = 3 + collection: str = "default" + +class EmbedResponse(BaseModel): + status: str + count: int + collection: str + +class SearchResultItem(BaseModel): + score: float = Field(..., ge=0) + text: str + +class PromptResponse(BaseModel): + answer: str + context: str + collection: str + +class DeleteResponse(BaseModel): + status: str + count: int + collection: str + source: Optional[str] = None + type: Optional[str] = None + +# ------------------------------------ +# Modelle für Exercises & Plans +# ------------------------------------ +class Exercise(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + summary: str + short_description: str + keywords: List[str] = [] 
+ link: Optional[str] = None + + discipline: str + group: Optional[str] = None + age_group: str + target_group: str + min_participants: int + duration_minutes: int + + capabilities: Dict[str,int] = {} + category: str + + purpose: str + execution: str + notes: str + preparation: str + method: str + equipment: List[str] = [] + +class PhaseExercise(BaseModel): + exercise_id: str + cond_load: Dict[str, Any] = {} + coord_load: Dict[str, Any] = {} + instructions: str = "" + +class PlanPhase(BaseModel): + name: str + duration_minutes: int + method: str + method_notes: str = "" + exercises: List[PhaseExercise] + +class TrainingPlan(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + title: str + short_description: str + + collection: str + discipline: str + group: Optional[str] = None + dojo: str + date: date + plan_duration_weeks: int + focus_areas: List[str] = [] + predecessor_plan_id: Optional[str] = None + + age_group: str + created_at: datetime = Field(default_factory=datetime.utcnow) + + phases: List[PlanPhase] + +# ---------------------------------- +# Embedding-Modell und Qdrant-Client +# ---------------------------------- +model = SentenceTransformer("all-MiniLM-L6-v2") +qdrant = QdrantClient( + host=os.getenv("QDRANT_HOST", "localhost"), + port=int(os.getenv("QDRANT_PORT", 6333)) +) + +# Ensure Exercise-Collection exists +if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# Ensure TrainingPlan-Collection exists +PLAN_COLL = "training_plans" +if not qdrant.collection_exists(PLAN_COLL): + qdrant.recreate_collection( + collection_name=PLAN_COLL, + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + +# ---------------------- +# Endpunkte für Exercises +# ---------------------- +@app.post("/exercise", response_model=Exercise) +def create_exercise(ex: Exercise): + # Ensure collection exists + if not qdrant.collection_exists("exercises"): + qdrant.recreate_collection( + collection_name="exercises", + vectors_config=VectorParams( + size=model.get_sentence_embedding_dimension(), + distance=Distance.COSINE + ) + ) + vec = model.encode(f"{ex.title}. 
{ex.summary}").tolist() + point = PointStruct(id=ex.id, vector=vec, payload=ex.dict()) + qdrant.upsert(collection_name="exercises", points=[point]) + return ex + +@app.get("/exercise", response_model=List[Exercise]) +def list_exercises( + discipline: Optional[str] = Query(None), + group: Optional[str] = Query(None), + tags: Optional[str] = Query(None) +): + filters = [] + if discipline: + filters.append({"key":"discipline","match":{"value":discipline}}) + if group: + filters.append({"key":"group","match":{"value":group}}) + if tags: + for t in tags.split(","): + filters.append({"key":"keywords","match":{"value":t.strip()}}) + if filters: + pts, _ = qdrant.scroll( + collection_name="exercises", + scroll_filter={"must": filters}, + limit=10000 + ) + else: + pts, _ = qdrant.scroll(collection_name="exercises", limit=10000) + return [Exercise(**pt.payload) for pt in pts] + +# ----------------- +# Bestehende Endpunkte +# ----------------- +@app.post("/embed") +def embed_texts(data: EmbedRequest): + collection_name = data.collection + if not qdrant.collection_exists(collection_name): + qdrant.recreate_collection( + collection_name=collection_name, + vectors_config=VectorParams(size=384, distance=Distance.COSINE) + ) + embeddings = model.encode([c.text for c in data.chunks]).tolist() + points = [] + for i, chunk in enumerate(data.chunks): + payload = { + "text": chunk.text, + "source": chunk.source, + "source_type": chunk.source_type, + "title": chunk.title, + "version": chunk.version, + "related_to": chunk.related_to, + "tags": chunk.tags, + "owner": chunk.owner, + "context_tag": chunk.context_tag, + "imported_at": chunk.imported_at or datetime.utcnow().isoformat(), + "chunk_index": chunk.chunk_index, + "category": chunk.category or data.collection + } + points.append(PointStruct(id=str(uuid4()), vector=embeddings[i], payload=payload)) + qdrant.upsert(collection_name=collection_name, points=points) + return {"status":"✅ embeddings saved","count":len(points),"collection":collection_name} + +@app.get("/search", response_model=List[SearchResultItem]) +def search_text(query: str = Query(..., min_length=1), limit: int = Query(3, ge=1), collection: str = Query("default")): + vec = model.encode(query).tolist() + res = qdrant.search(collection_name=collection, query_vector=vec, limit=limit) + return [SearchResultItem(score=r.score, text=r.payload['text']) for r in res] + +@app.post("/prompt", response_model=PromptResponse) +def prompt(data: PromptRequest): + if not data.query.strip(): + raise HTTPException(status_code=400, detail="'query' darf nicht leer sein.") + if not (1 <= data.context_limit <= 10): + raise HTTPException(status_code=400, detail="'context_limit' muss zwischen 1 und 10 liegen.") + hits = qdrant.search( + collection_name=data.collection, + query_vector=model.encode(data.query).tolist(), + limit=data.context_limit + ) + context = "\n".join(h.payload['text'] for h in hits) + payload = {"model":OLLAMA_MODEL,"prompt":f"Context:\n{context}\nQuestion: {data.query}","stream":False} + try: + r = requests.post(OLLAMA_URL, json=payload, timeout=30) + r.raise_for_status() + except Exception: + raise HTTPException(status_code=502, detail="LLM-Service-Fehler.") + return PromptResponse(answer=r.json().get("response",""), context=context, collection=data.collection) + +@app.delete("/delete-source", response_model=DeleteResponse) +def delete_by_source( + collection: str = Query(...), + source: Optional[str] = Query(None), + type: Optional[str] = Query(None), + owner: Optional[str] = Query(None), + 
category: Optional[str] = Query(None)
+):
+    if not qdrant.collection_exists(collection):
+        raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.")
+    filt = []
+    if source: filt.append({"key":"source","match":{"value":source}})
+    if type: filt.append({"key":"type","match":{"value":type}})
+    if owner: filt.append({"key":"owner","match":{"value":owner}})
+    if category: filt.append({"key":"category","match":{"value":category}})
+    if not filt:
+        raise HTTPException(status_code=400, detail="Mindestens ein Filterparameter muss angegeben werden.")
+    pts, _ = qdrant.scroll(collection_name=collection, scroll_filter={"must":filt}, limit=10000)
+    ids = [str(p.id) for p in pts]
+    if not ids:
+        return DeleteResponse(status="🔍 Keine passenden Einträge gefunden.", count=0, collection=collection)
+    qdrant.delete(collection_name=collection, points_selector=PointIdsList(points=ids))
+    return DeleteResponse(status="🗑️ gelöscht", count=len(ids), collection=collection)
+
+@app.delete("/delete-collection", response_model=DeleteResponse)
+def delete_collection(collection: str = Query(...)):
+    if not qdrant.collection_exists(collection):
+        raise HTTPException(status_code=404, detail=f"Collection '{collection}' nicht gefunden.")
+    qdrant.delete_collection(collection_name=collection)
+    return DeleteResponse(status="🗑️ gelöscht", count=0, collection=collection)
+
+# ------------------------
+# Endpunkte für TrainingPlans
+# ------------------------
+@app.post("/plan", response_model=TrainingPlan)
+def create_plan(plan: TrainingPlan):
+    # Ensure plan collection exists
+    if not qdrant.collection_exists(PLAN_COLL):
+        qdrant.recreate_collection(
+            collection_name=PLAN_COLL,
+            vectors_config=VectorParams(
+                size=model.get_sentence_embedding_dimension(),
+                distance=Distance.COSINE
+            )
+        )
+    vec = model.encode(f"{plan.title}. {plan.short_description}").tolist()
+    payload = plan.dict()
+    qdrant.upsert(collection_name=PLAN_COLL, points=[PointStruct(id=plan.id, vector=vec, payload=payload)])
+    return plan
+
+@app.get("/plan", response_model=List[TrainingPlan])
+def list_plans(
+    collection: str = Query(PLAN_COLL),
+    discipline: Optional[str] = Query(None),
+    group: Optional[str] = Query(None),
+    dojo: Optional[str] = Query(None)
+):
+    if not qdrant.collection_exists(collection):
+        return []
+    pts, _ = qdrant.scroll(collection_name=collection, limit=10000)
+    result: List[TrainingPlan] = []
+    for pt in pts:
+        plan = TrainingPlan(**pt.payload)
+        if discipline and plan.discipline != discipline: continue
+        if group and plan.group != group: continue
+        if dojo and plan.dojo != dojo: continue
+        result.append(plan)
+    return result
diff --git a/llm-api/test_delete_filters.sh b/llm-api/test_delete_filters.sh
new file mode 100755
index 0000000..857b828
--- /dev/null
+++ b/llm-api/test_delete_filters.sh
@@ -0,0 +1,173 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+BASE_URL="http://127.0.0.1:8000"
+COL="filter_test"
+
+# Prüfe, ob jq installiert ist
+if ! command -v jq &>/dev/null; then
+  echo "Bitte installieren: sudo apt-get install -y jq"
+  exit 1
+fi
+
+# Funktion zum Einfügen eines vollständigen Chunks
+function test_embed_full {
+  local SRC=$1; local OWNER=$2; local CAT=$3; local TEXT=$4
+  local IDX=$5
+  payload=$(jq -n \
+    --arg col "$COL" \
+    --arg txt "$TEXT" \
+    --arg src "$SRC" \
+    --arg owner "$OWNER" \
+    --arg cat "$CAT" \
+    --argjson idx $IDX '{
+      collection: $col,
+      chunks: [
+        {
+          text: $txt,
+          source: $src,
+          source_type: "unit-test",
+          title: $txt,
+          version: "v1",
+          related_to: "rel",
+          tags: ["t1","t2"],
+          owner: $owner,
+          context_tag: "ctx",
+          imported_at: "2025-08-06T00:00:00Z",
+          chunk_index: $idx,
+          category: $cat
+        }
+      ]
+    }')
+  r=$(curl -s -X POST "${BASE_URL}/embed" -H "Content-Type: application/json" -d "$payload")
+  cnt=$(echo "$r" | jq -r '.count // 0')
+  if [[ "$cnt" -ne 1 ]]; then echo "/embed fehlgeschlagen: $r"; exit 1; fi
+  echo "✅ Embed idx=$IDX src=$SRC owner=$OWNER cat=$CAT text=$TEXT"
+}
+
+# Funktion für den Lösch-Test
+function assert_delete {
+  local DESC=$1; shift
+  local PARAMS=$1; shift
+  local EXPECT=$1; shift
+  echo -n "Test: ${DESC} ... "
+  local resp=$(curl -s -X DELETE "${BASE_URL}/delete-source?collection=${COL}&${PARAMS}")
+  local cnt=$(echo "$resp" | jq -r '.count // 0')
+  if [[ "$cnt" -eq "$EXPECT" ]]; then
+    echo "✅ gelöscht=${cnt}"
+  else
+    echo "❌ gelöscht=${cnt}, erwartet=${EXPECT}"
+    echo "   Response: $resp"
+    exit 1
+  fi
+}
+
+# Pause-Funktion
+function pause {
+  read -p "Drücke [Enter] um fortzufahren ..."
+}
+
+# Hilfsfunktion: Collection leeren und mit den 4 Testpunkten neu befüllen
+function reset_and_seed {
+  curl -s -X DELETE "${BASE_URL}/delete-collection?collection=${COL}" || true
+  test_embed_full s1 o1 c1 A 0
+  test_embed_full s2 o2 c2 B 1
+  test_embed_full s1 o2 c1 C 2
+  test_embed_full s2 o1 c2 D 3
+}
+
+# 1) Initialisiere Collection
+echo "=== 1) Initialisiere Test-Collection ==="
+curl -s -X DELETE "${BASE_URL}/delete-collection?collection=${COL}" || true
+pause
+
+# 2) Einfügen von 4 Punkten
+echo "=== 2) Einfügen von Testdaten ==="
+test_embed_full s1 o1 c1 A 0
+test_embed_full s2 o2 c2 B 1
+test_embed_full s1 o2 c1 C 2
+test_embed_full s2 o1 c2 D 3
+pause
+
+# 3) Einzelkriterien (jeder Test auf frischer Datenbasis)
+echo "=== 3) Einzelkriterien ==="
+for test in \
+  "source=s1" "source=s2" "owner=o1" "owner=o2" "category=c1" "category=c2"; do
+  reset_and_seed
+  # Jeder Einzelfilter trifft genau 2 der 4 Punkte
+  assert_delete "$test" "$test" 2
+  pause
+done
+
+# 4) Zwei Kriterien
+echo "=== 4) Zwei Kriterien ==="
+# Erwartete Treffer: s1+o2 -> C (1), s2+c2 -> B und D (2), o1+c2 -> D (1)
+for test in \
+  "source=s1&owner=o2:1" "source=s2&category=c2:2" "owner=o1&category=c2:1"; do
+  PARAMS=${test%:*}
+  EXPECT=${test##*:}
+  reset_and_seed
+  assert_delete "$PARAMS" "$PARAMS" "$EXPECT"
+  pause
+done
+
+# 5) Drei Kriterien
+echo "=== 5) Drei Kriterien ==="
+for params in "source=s1&owner=o2&category=c1"; do
+  reset_and_seed
+  assert_delete "$params" "$params" 1
+  pause
+done
+
+# 6) Kein Filter (400 erwartet)
+echo "=== 6) Kein Filter (400 erwartet) ==="
+reset_and_seed
+echo -n "Test: no-filter ... "
+code=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE "${BASE_URL}/delete-source?collection=${COL}")
+if [[ "$code" -eq 400 ]]; then echo "✅ HTTP 400"; else echo "❌ HTTP ${code}"; exit 1; fi
+pause
+
+# 7) Cleanup
+echo "=== 7) Cleanup ==="
+curl -s -X DELETE "${BASE_URL}/delete-collection?collection=${COL}" || true
+echo "✅ Collection gelöscht"
+
+echo "🎉 Test abgeschlossen! 🎉"
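The /delete-source endpoint above turns each supplied query parameter into one "must" condition, so several parameters combine as a logical AND. The following is a minimal standalone sketch of that filter composition against Qdrant directly, assuming qdrant-client is installed, a local instance listens on port 6333, and the `filter_test` collection was seeded by the script above; `delete_by_payload` is an illustrative helper, not part of the repo:

    from qdrant_client import QdrantClient
    from qdrant_client.http.models import Filter, FieldCondition, MatchValue, PointIdsList

    client = QdrantClient(host="localhost", port=6333)

    def delete_by_payload(collection: str, **criteria: str) -> int:
        # One FieldCondition per key/value pair; 'must' means all conditions apply (AND).
        conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k, v in criteria.items()]
        points, _ = client.scroll(
            collection_name=collection,
            scroll_filter=Filter(must=conditions),
            limit=10_000,
        )
        ids = [p.id for p in points]
        if ids:
            client.delete(collection_name=collection, points_selector=PointIdsList(points=ids))
        return len(ids)

    # Mirrors the "source=s1&owner=o2" case: of the four seeded points, exactly C matches.
    print(delete_by_payload("filter_test", source="s1", owner="o2"))

This also makes the expected counts in the script transparent: each single-value filter matches two of the four seeded points, while most two-value combinations narrow the match down to one.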
diff --git a/llm-api/test_exercise_idempotent.sh b/llm-api/test_exercise_idempotent.sh
new file mode 100755
index 0000000..4649161
--- /dev/null
+++ b/llm-api/test_exercise_idempotent.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+API="http://localhost:8000"
+
+echo "=== Cleanup collection ==="
+curl -s -X DELETE "$API/delete-collection?collection=exercises" | jq -r '.status' || true
+
+echo
+echo "=== Create baseline (with external_id) ==="
+EXT_ID="mw:pageid:218"
+PAYLOAD_1=$(cat <<'JSON'
+{
+  "title": "Affenklatschen",
+  "summary": "Mobilisierung der Schulter",
+  "short_description": "Mobilisierung der Schulter",
+  "keywords": ["Aufwärmen","Dehnen","Mobilisierung","Schulter"],
+  "link": "https://karatetrainer.net/index.php?title=Affenklatschen",
+  "discipline": "Allgemein",
+  "group": "1",
+  "age_group": "Kinder, Schüler, Teenager, Erwachsene",
+  "target_group": "Breitensportler",
+  "min_participants": 1,
+  "duration_minutes": 1,
+  "capabilities": {"Flexibilität":1,"Kopplungsfähigkeit":1},
+  "category": "Übungen",
+  "purpose": "Mobilisierung der Schulter",
+  "execution": "Beschreibung A",
+  "notes": "Hinweise A",
+  "preparation": "Dynamisches Dehnen",
+  "method": "",
+  "equipment": [],
+  "fullurl": "https://karatetrainer.net/index.php?title=Affenklatschen",
+  "external_id": "mw:pageid:218",
+  "source": "MediaWiki",
+  "source_version": "rev-1",
+  "fingerprint": "fp-1"
+}
+JSON
+)
+curl -s -X POST "$API/exercise" -H "Content-Type: application/json" -d "$PAYLOAD_1" | jq -r '.status // "ok"'
+
+echo
+echo "=== Upsert same external_id with changed content (should update, not duplicate) ==="
+PAYLOAD_2=$(echo "$PAYLOAD_1" | jq '.summary="NEU: Mobilisierung der Schulter (Update)" | .source_version="rev-2" | .fingerprint="fp-2"')
+curl -s -X POST "$API/exercise" -H "Content-Type: application/json" -d "$PAYLOAD_2" | jq -r '.status // "ok"'
+
+echo
+echo "=== Check by external_id ==="
+curl -s "$API/exercise/by-external-id?external_id=$EXT_ID" | jq
+
+echo
+echo "=== Count entries (should be 1) ==="
+COUNT=$(curl -s "$API/exercise" | jq '[.[] | select(.title=="Affenklatschen")] | length')
+echo "Count=$COUNT"
+if [ "$COUNT" != "1" ]; then
+  echo "❌ Expected 1, got $COUNT"
+  exit 1
+fi
+
+echo
+echo "=== Verify updated summary ==="
+CUR_SUMMARY=$(curl -s "$API/exercise/by-external-id?external_id=$EXT_ID" | jq -r '.payload.summary')
+echo "Summary=$CUR_SUMMARY"
+if [[ "$CUR_SUMMARY" != "NEU: Mobilisierung der Schulter (Update)" ]]; then
+  echo "❌ Update did not apply"
+  exit 1
+fi
+
+echo
+echo "✅ Idempotent upsert OK"
diff --git a/llm-api/test_exercise_plan.sh b/llm-api/test_exercise_plan.sh
new file mode 100755
index 0000000..f7a7756
--- /dev/null
+++ b/llm-api/test_exercise_plan.sh
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+BASE_URL="http://127.0.0.1:8000"
+EX_COL="exercises"
+PL_COL="training_plans"
+
+# Utility: assert status code
+function assert_status {
+  local code=$1; local expect=$2; local body=$3
+  if [[ "$code" -ne "$expect" ]]; then
+    echo "❌ Unerwarteter HTTP-Status: $code, erwartet $expect"
+    echo "Response Body: $body"
+    exit 1
+  fi
+}
+
+# Utility: Antwort per jq extrahieren
+function jqf { echo "$1" | jq -r "$2"; }
+
+echo "=== 1) Clean up Collections ==="
+curl -s -X DELETE "$BASE_URL/delete-collection?collection=$EX_COL" || true
+curl -s -X DELETE "$BASE_URL/delete-collection?collection=$PL_COL" || true
+echo "✅ Alle Collections gelöscht"
+
+echo
+echo "=== 2) POST /exercise (Create Exercise) ==="
+EX_PAYLOAD=$(jq -n '{
+  title: "Kniebeuge",
+  summary: "Partnerübung für Stabilität",
+  short_description: "Partner drückt von vorne",
+  keywords: ["Kraft","Partner"],
+  link: "http://wiki/uebung/kniebeuge",
+  discipline: "Karate",
+  group: "Mittelstufe",
+  age_group: "Erwachsene",
+  target_group: "Breitensport",
+  min_participants: 2,
+  duration_minutes: 5,
+  capabilities: {"strength":3,"balance":2},
+  category: "Grundübung",
+  purpose: "Stärkung Beine",
+  execution: "Langsam herabsenken",
+  notes: "Rücken gerade halten",
+  preparation: "Partnerposition",
+  method: "Partnerwiderstand",
+  equipment: ["Partner"]
+}')
+R=$(curl -s -w "\n%{http_code}" -X POST "$BASE_URL/exercise" -H "Content-Type: application/json" -d "$EX_PAYLOAD")
+BODY=$(echo "$R" | sed '$d')
+CODE=$(echo "$R" | tail -n1)
+assert_status $CODE 200 "$BODY"
+EX_ID=$(jqf "$BODY" '.id')
+echo "✅ Exercise erstellt mit id=$EX_ID"
+
+echo
+echo "=== 3) GET /exercise (List & Filter) ==="
+# 3a) ohne Filter
+FULL=$(curl -s -X GET "$BASE_URL/exercise")
+COUNT=$(echo "$FULL" | jq 'length')
+if [[ "$COUNT" -ne 1 ]]; then
+  echo "❌ /exercise returned $COUNT entries, expected 1"
+  exit 1
+fi
+echo "✅ /exercise list returns $COUNT Eintrag"
+
+# 3b) filter discipline
+F=$(curl -s -G "$BASE_URL/exercise" --data-urlencode "discipline=Karate")
+if [[ "$(echo "$F" | jq '.[0].id')" != "\"$EX_ID\"" ]]; then
+  echo "❌ discipline filter fehlgeschlagen"
+  exit 1
+fi
+echo "✅ Filter discipline funktioniert"
+
+# 3c) filter tags
+F2=$(curl -s -G "$BASE_URL/exercise" --data-urlencode "tags=Partner")
+if [[ "$(echo "$F2" | jq '.[0].id')" != "\"$EX_ID\"" ]]; then
+  echo "❌ tags filter fehlgeschlagen"
+  exit 1
+fi
+echo "✅ Filter tags funktioniert"
+
+echo
+echo "=== 4) POST /plan (Create TrainingPlan) ==="
+PLAN_PAYLOAD=$(jq -n --arg exid "$EX_ID" '{
+  title: "Bein-Training",
+  short_description: "Stabilität und Kraft",
+  collection: "training_plans",
+  discipline: "Karate",
+  group: "Mittelstufe",
+  dojo: "Dojo A",
+  date: "2025-08-10",
+  plan_duration_weeks: 4,
+  focus_areas: ["Kraft","Technik"],
+  predecessor_plan_id: null,
+  age_group: "Erwachsene",
+  phases: [
+    {
+      name: "Aufwärmen",
+      duration_minutes: 10,
+      method: "Laufen",
+      method_notes: "locker",
+      exercises: [
+        {
+          exercise_id: $exid,
+          cond_load: {"reps":5},
+          coord_load: {"balance":2},
+          instructions: "Langsam ausführen"
+        }
+      ]
+    }
+  ]
+}')
+R2=$(curl -s -w "\n%{http_code}" -X POST "$BASE_URL/plan" -H "Content-Type: application/json" -d "$PLAN_PAYLOAD")
+BODY2=$(echo "$R2" | sed '$d')
+CODE2=$(echo "$R2" | tail -n1)
+assert_status $CODE2 200 "$BODY2"
+PL_ID=$(jqf "$BODY2" '.id')
+echo "✅ Plan erstellt mit id=$PL_ID"
+
+echo
+echo "=== 5) GET /plan (List & Filter) ==="
+# 5a) ohne Filter
+L=$(curl -s -G "$BASE_URL/plan")
+if [[ "$(echo "$L" | jq 'length')" -ne 1 ]]; then
+  echo "❌ /plan returned $(echo "$L" | jq 'length') entries, expected 1"
+  exit 1
+fi
+echo "✅ /plan list returns 1 Eintrag"
+
+# 5b) filter discipline
+LF=$(curl -s -G "$BASE_URL/plan" --data-urlencode "discipline=Karate")
+if [[ "$(echo "$LF" | jq '.[0].id')" != "\"$PL_ID\"" ]]; then
+  echo "❌ discipline filter for /plan failed"
+  exit 1
+fi
+echo "✅ /plan discipline filter funktioniert"
+
+# 5c) filter group
+LG=$(curl -s -G "$BASE_URL/plan" --data-urlencode "group=Mittelstufe")
+if [[ "$(echo "$LG" | jq '.[0].id')" != "\"$PL_ID\"" ]]; then
+  echo "❌ group filter for /plan failed"
+  exit 1
+fi
+echo "✅ /plan group filter funktioniert"
+
+# Cleanup
+echo
+echo "=== Cleanup Collections ==="
+curl -s -X DELETE "$BASE_URL/delete-collection?collection=$EX_COL" || true
+curl -s -X DELETE "$BASE_URL/delete-collection?collection=$PL_COL" || true
+echo "✅ Cleanup done"
+
+echo
+echo "🎉 Alle Tests für Exercises & Plans erfolgreich! 🎉"
diff --git a/llm-api/test_llm_api.sh b/llm-api/test_llm_api.sh
new file mode 100755
index 0000000..9becf23
--- /dev/null
+++ b/llm-api/test_llm_api.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+import sys
+import requests
+
+BASE_URL = "http://127.0.0.1:8000"
+COL = "test_collection"
+SRC = "unit-test-src"
+
+def fail(msg):
+    print("✗", msg)
+    sys.exit(1)
+
+def test_openapi():
+    r = requests.get(f"{BASE_URL}/openapi.json")
+    if r.status_code != 200:
+        fail(f"/openapi.json returned {r.status_code}")
+    print("✓ OpenAPI: 200 OK")
+
+def test_embed():
+    payload = {
+        "collection": COL,
+        "chunks": [
+            {"text": "Das ist ein Testtext für Embed.", "source": SRC}
+        ]
+    }
+    r = requests.post(f"{BASE_URL}/embed", json=payload)
+    if r.status_code != 200:
+        fail(f"/embed returned {r.status_code}: {r.text}")
+    data = r.json()
+    if data.get("count") != 1:
+        fail(f"/embed count != 1: {data}")
+    print("✓ Embed: 1 Eintrag gespeichert")
+
+def test_search():
+    params = {"query": "Testtext", "collection": COL}
+    r = requests.get(f"{BASE_URL}/search", params=params)
+    if r.status_code != 200:
+        fail(f"/search returned {r.status_code}: {r.text}")
+    results = r.json()
+    if not any("score" in item for item in results):
+        fail(f"/search lieferte keine Treffer: {results}")
+    print("✓ Search: Treffer gefunden")
+
+def test_prompt():
+    payload = {"query": "Wie lautet dieser Testtext?", "context_limit": 1, "collection": COL}
+    r = requests.post(f"{BASE_URL}/prompt", json=payload)
+    if r.status_code != 200:
+        fail(f"/prompt returned {r.status_code}: {r.text}")
+    data = r.json()
+    if "answer" not in data:
+        fail(f"/prompt liefert kein 'answer'-Feld: {data}")
+    print("✓ Prompt: Antwort erhalten")
+
+def test_delete_source():
+    params = {"collection": COL, "source": SRC}
+    r = requests.delete(f"{BASE_URL}/delete-source", params=params)
+    if r.status_code != 200:
+        fail(f"/delete-source returned {r.status_code}: {r.text}")
+    data = r.json()
+    if data.get("count") != 1:
+        fail(f"/delete-source count != 1: {data}")
+    print("✓ Delete-source: 1 Eintrag gelöscht")
+
+def test_delete_collection():
+    params = {"collection": COL}
+    r = requests.delete(f"{BASE_URL}/delete-collection", params=params)
+    if r.status_code != 200:
+        fail(f"/delete-collection returned {r.status_code}: {r.text}")
+    print("✓ Delete-collection: Collection gelöscht")
+
+if __name__ == "__main__":
+    print("\nStarte API-Tests...\n")
+    test_openapi()
+    test_embed()
+    test_search()
+    test_prompt()
+    test_delete_source()
+    test_delete_collection()
+    print("\n🎉 Alle Tests erfolgreich durchlaufen!")
diff --git a/llm-api/test_llm_api_full.sh b/llm-api/test_llm_api_full.sh
new file mode 100755
index 0000000..ac948de
--- /dev/null
+++ b/llm-api/test_llm_api_full.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# 1) Basis-URL Deines FastAPI-Servers
+export BASE_URL="http://localhost:8000"
+
+# 2) MediaWiki-Zugangsdaten
+export WIKI_API_URL="https://karatetrainer.net/api.php"
+export WIKI_BOT_USER="LarsS@APIBot"
+export WIKI_BOT_PASSWORD="6snci781sh79tbmvb2u9ld4bkd1i7n5t"
+
+echo -e "\n→ Starte Health-Check für MediaWiki"
+HTTP=$(curl -s -o /dev/null -w '%{http_code}' "${BASE_URL}/import/wiki/health")
+if [[ "$HTTP" != "200" ]]; then
+  echo "❌ Health-Check fehlgeschlagen (HTTP $HTTP)"
+  exit 1
+fi
+echo "✅ MediaWiki-Health OK (200)"
+
+echo -e "\n→ Teste MediaWiki-Login"
+LOGIN_RESP=$(curl -s -X POST "${BASE_URL}/import/wiki/login" \
+  -H "Content-Type: application/json" \
+  -d "{\"username\":\"${WIKI_BOT_USER}\",\"password\":\"${WIKI_BOT_PASSWORD}\"}" )
+echo "→ Login-Response: $LOGIN_RESP"
+# Optional: prüfen, ob "success" enthalten ist
+if [[ "$LOGIN_RESP" != *"success"* ]]; then
+  echo "❌ MediaWiki-Login fehlgeschlagen"
+  exit 1
+fi
+echo "✅ MediaWiki-Login erfolgreich"
+
+echo -e "\n→ Teste bestehenden /exercise-Endpoint"
+# Collection exercises aufräumen
+curl -s -X DELETE "${BASE_URL}/delete-collection?collection=exercises" >/dev/null || true
+
+# Erzeugen
+CREATE=$(curl -s -X POST "${BASE_URL}/exercise" \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "title":"Testübung",
+    "summary":"Zusammenfassung",
+    "short_description":"Kurz",
+    "discipline":"Test",
+    "age_group":"Erwachsene",
+    "target_group":"Tester",
+    "min_participants":1,
+    "duration_minutes":5,
+    "category":"test",
+    "purpose":"Demo",
+    "execution":"Ausführen",
+    "notes":"",
+    "preparation":"",
+    "method":"",
+    "equipment":[]
+  }')
+echo "→ Create Übung: $CREATE"
+if [[ "$CREATE" != *"Testübung"* ]]; then
+  echo "❌ Übung wurde nicht angelegt"
+  exit 1
+fi
+echo "✅ /exercise POST OK"
+
+# Auflisten
+LIST=$(curl -s "${BASE_URL}/exercise?discipline=Test")
+if [[ "$LIST" != *"Testübung"* ]]; then
+  echo "❌ /exercise GET liefert nicht die Testübung"
+  exit 1
+fi
+echo "✅ /exercise GET OK"
+
+echo -e "\n→ Teste Embed/Search"
+# Einen Chunk einbetten
+curl -s -X POST "${BASE_URL}/embed" \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "chunks": [
+      {"text":"Hallo Welt","source":"test","source_type":"txt","title":"T","version":"1","related_to":"","tags":[],"owner":""}
+    ],
+    "collection":"default"
+  }' >/dev/null
+
+# Suche
+SEARCH=$(curl -s "${BASE_URL}/search?query=Hallo")
+if [[ "$SEARCH" != *"Hallo Welt"* ]]; then
+  echo "❌ /search liefert unerwartetes Ergebnis"
+  exit 1
+fi
+echo "✅ /search OK"
+
+echo -e "\n🎉 Alle Tests erfolgreich abgeschlossen!"
diff --git a/llm-api/test_llm_api_wiki.sh b/llm-api/test_llm_api_wiki.sh
new file mode 100755
index 0000000..da589a6
--- /dev/null
+++ b/llm-api/test_llm_api_wiki.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Basis-URL (ggf. anpassen)
+BASE_URL="${BASE_URL:-http://localhost:8000}"
+
+echo "1️⃣ MediaWiki Health-Check"
+HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' "${BASE_URL}/import/wiki/health")
+if [[ "$HTTP_CODE" != "200" ]]; then
+  echo "❌ Health-Check fehlgeschlagen (HTTP $HTTP_CODE)"
+  exit 1
+fi
+echo "✅ Health-Check OK (HTTP 200)"
+
+echo
+echo "2️⃣ Exercise-CRU-Test (Stichprobe)"
+# Erst löschen, falls Test-Exercise bereits existiert
+curl -s -X DELETE "${BASE_URL}/delete-collection?collection=exercises" > /dev/null || true
+
+# Create
+CREATE_RESP=$(curl -s -X POST "${BASE_URL}/exercise" \
+  -H 'Content-Type: application/json' \
+  -d '{"title":"Testübung","summary":"Zusammenfassung","short_description":"Kurz","discipline":"Test","age_group":"Erwachsene","target_group":"Tester","min_participants":1,"duration_minutes":5,"category":"test","purpose":"Test","execution":"Ausführen","notes":"","preparation":"","method":"","equipment":[]}' )
+
+echo "→ Create-Response: $CREATE_RESP"
+
+# List
+LIST_RESP=$(curl -s "${BASE_URL}/exercise?discipline=Test")
+if [[ "$LIST_RESP" != *"Testübung"* ]]; then
+  echo "❌ Exercise-List nicht wie erwartet"
+  exit 1
+fi
+echo "✅ Exercise angelegt und gefunden"
+
+echo
+echo "3️⃣ Search-Endpoint-Test"
+# Indexiere manuell einen Chunk
+curl -s -X POST "${BASE_URL}/embed" \
+  -H 'Content-Type: application/json' \
+  -d '{"chunks":[{"text":"Hallo Welt","source":"test","version":"1","title":"T","related_to":"","tags":[],"owner":""}],"collection":"default"}' \
+  > /dev/null
+
+SEARCH_RESP=$(curl -s "${BASE_URL}/search?query=Hallo")
+if [[ "$SEARCH_RESP" != *"Hallo Welt"* ]]; then
+  echo "❌ Search-Ergebnis nicht korrekt"
+  exit 1
+fi
+echo "✅ Search liefert erwarteten Treffer"
+
+echo
+echo "✅ Alle Tests erfolgreich durchgelaufen."
diff --git a/llm-api/test_wiki_ep1.sh b/llm-api/test_wiki_ep1.sh
new file mode 100755
index 0000000..b43c998
--- /dev/null
+++ b/llm-api/test_wiki_ep1.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+BASE="${BASE_URL:-http://localhost:8000/import/wiki}"
+
+# 1) Health-Check
+echo "→ Health-Check"
+curl -s -o /dev/null -w '%{http_code}\n' "${BASE}/health"
+
+# 2) Login (damit Session-Cookies gesetzt werden, wenn Du das weiter nutzt)
+echo -e "\n→ Login"
+curl -s -X POST "${BASE}/login" \
+  -H "Content-Type: application/json" \
+  -d "{\"username\":\"${WIKI_BOT_USER}\",\"password\":\"${WIKI_BOT_PASSWORD}\"}" | jq
+
+# 3) Import aus Kategorie
+echo -e "\n→ Importiere Übungen aus Übungen"
+RESP=$(curl -s -G "${BASE}/import/exercises" \
+  --data-urlencode "category=Übungen")
+echo "$RESP" | jq
+
+# 4) Ergebnis prüfen
+IMPORTED=$(echo "$RESP" | jq '.imported | length')
+echo -e "\n→ Anzahl importierter Übungen: $IMPORTED"
diff --git a/llm-api/test_wiki_exercises.sh b/llm-api/test_wiki_exercises.sh
new file mode 100755
index 0000000..cdf9b38
--- /dev/null
+++ b/llm-api/test_wiki_exercises.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+BASE="http://localhost:8000/import/wiki"
+
+echo "→ 1) Health-Check"
+curl -s -o /dev/null -w '%{http_code}\n' "${BASE}/health"
+
+echo -e "\n→ 2) Login"
+curl -s -X POST "${BASE}/login" \
+  -H "Content-Type: application/json" \
+  -d "{\"username\":\"${WIKI_BOT_USER}\",\"password\":\"${WIKI_BOT_PASSWORD}\"}" | jq
+
+echo -e "\n→ 3) SMW-Ask: Alle Übungen abfragen und Titel ausgeben"
+curl -s -G "${BASE}/semantic/pages" \
+  --data-urlencode "category=Übungen" \
+| jq -r 'keys[]'
+
+COUNT=$(curl -s -G "${BASE}/semantic/pages" \
+  --data-urlencode "category=Übungen" \
+| jq 'keys | length')
+echo -e "\n→ Anzahl Übungen: ${COUNT}"
+
+echo -e "\n→ 4) Details der ersten Übung"
+FIRST_TITLE=$(curl -s -G "${BASE}/semantic/pages" \
+  --data-urlencode "category=Übungen" \
+| jq -r 'keys[0]')
+echo "Erste Übung: ${FIRST_TITLE}"
+
+FIRST_PAGEID=$(curl -s -G "${BASE}/semantic/pages" \
+  --data-urlencode "category=Übungen" \
+| jq -r ".\"${FIRST_TITLE}\".pageid")
+echo "PageID: ${FIRST_PAGEID}"
+
+echo -e "\n→ 5) Wikitext der ersten Übung"
+curl -s -G "${BASE}/pagecontent" \
+  --data-urlencode "pageid=${FIRST_PAGEID}" \
+  --data-urlencode "title=${FIRST_TITLE}" \
+| jq '.content'
+
+echo -e "\n✅ Test abgeschlossen."
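The wiki test scripts above and test_wiki_router.sh below all drive the same lookup chain: an SMW ask for the category, a title-to-pageid resolution, then a wikitext fetch. The equivalent round trip as a minimal Python sketch, assuming the API runs locally under /import/wiki, the category is Übungen, and each SMW result entry carries a pageid (the same assumption the shell tests make); it targets the /semantic/pages and /parsepage routes of wiki_router.py 1.2.0 shown later in this patch:

    import requests

    BASE = "http://localhost:8000/import/wiki"  # assumed local dev server

    # All pages of the category, keyed by title (SMW-Ask via /semantic/pages)
    pages = requests.get(f"{BASE}/semantic/pages", params={"category": "Übungen"}, timeout=30).json()
    print(len(pages), "Übungen gefunden")

    # Resolve the first title to a pageid, then fetch its raw wikitext via /parsepage
    title = next(iter(pages))
    pageid = pages[title]["pageid"]
    parsed = requests.get(f"{BASE}/parsepage", params={"pageid": pageid, "title": title}, timeout=20).json()
    print(title, parsed["wikitext"][:120])

Note that the archived routers expose /pagecontent (POST, returns "content") instead of /parsepage (GET, returns "wikitext"), which is why the older scripts and the current router are not interchangeable.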
diff --git a/llm-api/test_wiki_router.sh b/llm-api/test_wiki_router.sh new file mode 100755 index 0000000..961d1db --- /dev/null +++ b/llm-api/test_wiki_router.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# test_wiki_router.sh — End-to-end Tests für wiki_router-Endpunkte (Health, Login, SMW-Ask, Page Info/Parse/Detail) +# Voraussetzungen: +# - laufende FastAPI (uvicorn) unter http://localhost:8000 +# - jq installiert +# - optional: .env im selben Verzeichnis mit WIKI_BOT_USER / WIKI_BOT_PASSWORD +set -euo pipefail +export LC_ALL=C.UTF-8 LANG=C.UTF-8 + +API_BASE="http://localhost:8000/import/wiki" +CATEGORY_DEFAULT="Übungen" +CATEGORY="${WIKI_CATEGORY:-$CATEGORY_DEFAULT}" + +# --- Hilfsfunktionen --- +die() { echo " ❌ $*" >&2; exit 1; } +ok() { echo " ✅ $*"; } +info(){ echo " → $*"; } + +# .env laden (falls vorhanden) +if [[ -f .env ]]; then + set -o allexport; source .env; set +o allexport +fi + +# Debug: Maskiertes Echo der ENV +echo "DBG: User=${WIKI_BOT_USER:-}, Pass=$( [[ -n "${WIKI_BOT_PASSWORD:-}" ]] && echo set || echo unset )" + +echo "1) Health-Check" +HC_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$API_BASE/health" | tr -d '\r') +[[ "$HC_CODE" == "200" ]] || die "Health failed (HTTP $HC_CODE)" +ok "Health OK" + +echo +echo "2) Login" +[[ -n "${WIKI_BOT_USER:-}" && -n "${WIKI_BOT_PASSWORD:-}" ]] || die "ENV nicht gesetzt (WIKI_BOT_USER / WIKI_BOT_PASSWORD)" + +# JSON sicher bauen +LOGIN_JSON=$(jq -nc --arg u "$WIKI_BOT_USER" --arg p "$WIKI_BOT_PASSWORD" '{username:$u, password:$p}') + +# Request ausführen (mit klarer Fehleranzeige) +RAW_LOGIN_RESP=$(curl -sS -X POST "$API_BASE/login" -H 'Content-Type: application/json' -d "$LOGIN_JSON" | tr -d '\r' || true) + +# Prüfen, ob es valides JSON ist +if ! echo "$RAW_LOGIN_RESP" | jq . >/dev/null 2>&1; then + echo "$RAW_LOGIN_RESP" + die "Login-Response ist kein valides JSON (oben roh ausgegeben)" +fi + +echo "$RAW_LOGIN_RESP" | jq . +STATUS=$(echo "$RAW_LOGIN_RESP" | jq -r '.status // empty') +[[ "$STATUS" == "success" ]] || die "Login failed: $(echo "$RAW_LOGIN_RESP" | jq -r '.message // "Login fehlgeschlagen"')" +ok "Login successful" + +echo +echo "3) SMW-Ask: Alle Übungen zählen" +COUNT_JSON=$(curl -s -G "$API_BASE/semantic/pages" --data-urlencode "category=$CATEGORY" | tr -d '\r' || true) +if ! echo "$COUNT_JSON" | jq . >/dev/null 2>&1; then + echo "$COUNT_JSON" + die "SMW-Ask Count: Response ist kein valides JSON" +fi +EXERCISE_COUNT=$(echo "$COUNT_JSON" | jq 'keys | length') +info "$EXERCISE_COUNT Übungen gefunden" + +echo +echo "4) SMW-Ask: Erste 5 Übungstitel" +FIRST5_JSON=$(curl -s -G "$API_BASE/semantic/pages" --data-urlencode "category=$CATEGORY" | tr -d '\r' || true) +if ! echo "$FIRST5_JSON" | jq . >/dev/null 2>&1; then + echo "$FIRST5_JSON" + die "SMW-Ask Titel: Response ist kein valides JSON" +fi +echo "$FIRST5_JSON" | jq -r 'keys[0:5][]' | sed 's/^/ • /' + +TITLE="Affenklatschen" + +echo +echo "5) Core-API Info für \"$TITLE\"" +INFO_RESP=$(curl -s -G "$API_BASE/info" --data-urlencode "title=$TITLE" | tr -d '\r' || true) +if ! echo "$INFO_RESP" | jq . 
>/dev/null 2>&1; then + echo "$INFO_RESP" + die "Info: Response ist kein valides JSON" +fi +echo "$INFO_RESP" | jq +PAGEID=$(echo "$INFO_RESP" | jq -r '.pageid') +FULLURL=$(echo "$INFO_RESP" | jq -r '.fullurl // empty') +[[ "$PAGEID" =~ ^[0-9]+$ ]] || die "Info failed: pageid ungültig" +ok "pageid=$PAGEID, url=${FULLURL:-}" + +echo +echo "6) Parse-Endpoint für pageid=$PAGEID" +PARSE_RESP=$(curl -s -G "$API_BASE/parsepage" --data-urlencode "pageid=$PAGEID" --data-urlencode "title=$TITLE" | tr -d '\r' || true) +if ! echo "$PARSE_RESP" | jq . >/dev/null 2>&1; then + echo "$PARSE_RESP" + die "Parse: Response ist kein valides JSON" +fi +# Wikitext-Auszug (falls vorhanden) +WT=$(echo "$PARSE_RESP" | jq -r '.wikitext // ""') +printf '%s\n' "${WT:0:200}…" +ok "Wikitext (erster Ausschnitt) geladen" + +echo +echo "7) Detail-Endpoint für \"$TITLE\"" +DETAIL_RESP=$(curl -s -G "$API_BASE/semantic/page" --data-urlencode "category=$CATEGORY" --data-urlencode "title=$TITLE" | tr -d '\r' || true) +if ! echo "$DETAIL_RESP" | jq . >/dev/null 2>&1; then + echo "$DETAIL_RESP" + die "Detail: Response ist kein valides JSON" +fi +echo "$DETAIL_RESP" | jq '{title: .title, pageid: .pageid, fullurl: .fullurl, printouts: .printouts, wikitext_length: (.wikitext|length)}' +ok "Detail-Endpoint liefert erwartete Felder" + +echo +echo "🎉 Alle Tests erfolgreich abgeschlossen!" diff --git a/llm-api/uvicorn.log b/llm-api/uvicorn.log new file mode 100644 index 0000000..fc61653 --- /dev/null +++ b/llm-api/uvicorn.log @@ -0,0 +1,10 @@ +nohup: ignoring input +[DEBUG] llm_api.py version 1.0.2 loaded from /home/llmadmin/llm-api/llm_api.py +[DEBUG] Using OLLAMA_URL = http://localhost:11434/api/generate +[DEBUG] Using OLLAMA_MODEL = mistral:latest +INFO: Started server process [54813] +INFO: Waiting for application startup. +INFO: Application startup complete. +ERROR: [Errno 98] error while attempting to bind on address ('0.0.0.0', 8000): address already in use +INFO: Waiting for application shutdown. +INFO: Application shutdown complete. diff --git a/llm-api/wiki_cookies.txt b/llm-api/wiki_cookies.txt new file mode 100644 index 0000000..c31d989 --- /dev/null +++ b/llm-api/wiki_cookies.txt @@ -0,0 +1,4 @@ +# Netscape HTTP Cookie File +# https://curl.se/docs/http-cookies.html +# This file was generated by libcurl! Edit at your own risk. + diff --git a/llm-api/wiki_router.py b/llm-api/wiki_router.py new file mode 100644 index 0000000..8f224fc --- /dev/null +++ b/llm-api/wiki_router.py @@ -0,0 +1,173 @@ +""" +File: wiki_router.py +Beschreibung: +- Endpunkte für MediaWiki-Integration im lokalen Netzwerk. +- Funktionen: + * /health: Prüft Verfügbarkeit der MediaWiki-API. + * /login: Führt clientlogin durch und speichert Session-Cookies. + * /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask. + * /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab. + * /info: Liefert pageid und fullurl über Core-API Query. + * /semantic/page: Liefert Metadaten einer Übung und Wikitext sowie pageid über Core-API. 
+Version: 1.2.0 +""" +from dotenv import load_dotenv +load_dotenv() +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import Dict, Any, List +import requests, os + +__version__ = "1.2.0" +router = APIRouter() + +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +wiki_session = requests.Session() + +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class PageContentResponse(BaseModel): + pageid: int + title: str + wikitext: str + +class PageInfoResponse(BaseModel): + pageid: int + title: str + fullurl: str + +# Health-Check +@router.get("/health") +def health_check(): + try: + resp = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"}, + timeout=5 + ) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + # Token holen + try: + token_resp = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "meta": "tokens", "type": "login", "format": "json"}, + timeout=10 + ) + token_resp.raise_for_status() + token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Token-Error: {e}") + if not token: + raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") + # clientlogin + try: + login_resp = wiki_session.post( + WIKI_API_URL, + data={ + "action": "clientlogin", + "format": "json", + "username": data.username, + "password": data.password, + "logintoken": token, + "loginreturnurl": "http://localhost:8000" + }, + timeout=10 + ) + login_resp.raise_for_status() + status = login_resp.json().get("clientlogin", {}).get("status") + except Exception: + status = None + # fallback login + if status != "PASS": + alt = wiki_session.post( + WIKI_API_URL, + data={"action": "login", "format": "json", "lgname": data.username, "lgpassword": data.password}, + timeout=10 + ) + alt.raise_for_status() + status = alt.json().get("login", {}).get("result") + if status in ("PASS", "Success"): + return WikiLoginResponse(status="success", message=None) + return WikiLoginResponse(status="failed", message="Login fehlgeschlagen") + +# SMW-Ask: alle Übungen inkl. 
Unterkategorien +@router.get("/semantic/pages") +def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]: + smw_query = f"[[Category:{category}]]" + ask_query = f"{smw_query}|limit=50000" + r = wiki_session.get( + WIKI_API_URL, + params={"action": "ask", "query": ask_query, "format": "json"}, + timeout=30 + ) + try: + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}") + return r.json().get("query", {}).get("results", {}) + +# Wikitext über parse-Endpoint holen (per pageid) +@router.get("/parsepage", response_model=PageContentResponse) +def parse_page(pageid: int = Query(...), title: str = Query(None)): + r = wiki_session.get( + WIKI_API_URL, + params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, + timeout=20 + ) + try: + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Parse-Error: {e}") + wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "") + return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext) + +# Pageinfo über Core-API (ermittelt pageid + fullurl) +@router.get("/info", response_model=PageInfoResponse) +def page_info(title: str = Query(..., description="Name der Seite")): + r = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "titles": title, "prop": "info", "inprop": "url", "format": "json"}, + timeout=10 + ) + try: + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Info-Error: {e}") + pages = r.json().get("query", {}).get("pages", {}) + pid_str, page = next(iter(pages.items())) + pid = int(pid_str) + fullurl = page.get("fullurl") + return PageInfoResponse(pageid=pid, title=page.get("title"), fullurl=fullurl) + +# Detail-Endpoint für eine Übung: Metadaten aus Ask + Wikitext & ID via Core-API +@router.get("/semantic/page", response_model=Dict[str, Any]) +def semantic_page_detail(category: str = Query(...), title: str = Query(...)) -> Dict[str, Any]: + # Metadaten aus SMW-Ask + entries = semantic_pages(category) + entry = entries.get(title) + if not entry: + raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.") + # Pageinfo via Core-API + info = page_info(title=title) + # Wikitext via parse + parsed = parse_page(pageid=info.pageid, title=title) + return { + "title": title, + "pageid": info.pageid, + "fullurl": info.fullurl, + "printouts": entry.get("printouts", {}), + "wikitext": parsed.wikitext + } diff --git a/llm-api/wiki_router0.1.1.py b/llm-api/wiki_router0.1.1.py new file mode 100644 index 0000000..71adf2c --- /dev/null +++ b/llm-api/wiki_router0.1.1.py @@ -0,0 +1,110 @@ +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import List +import requests, os + +# Version hochgezählt +__version__ = "1.1.6" +router = APIRouter() + +# MediaWiki-Konfiguration +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +wiki_session = requests.Session() + +# Models +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class CategoryMembersResponse(BaseModel): + pageid: int + title: str + +class PageContentResponse(BaseModel): + pageid: int + title: str + content: str + +# Health-Check +@router.get("/health") +def health_check(): + params = {"action": "query", "meta": "siteinfo", "siprop": "general", 
"format": "json"} + try: + resp = wiki_session.get(WIKI_API_URL, params=params, timeout=5) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + """ + Führt Login mittels MediaWiki Bot-Password API durch. + Username kann im Format 'User@BotName' übergeben werden. + """ + # Verarbeite Bot-Password-Format + lgname = data.username + lgpassword = data.password + if '@' in data.username: + user, bot = data.username.split('@',1) + lgname = user + lgpassword = f"{bot}@{data.password}" + # Schritt 1: Login-Token holen + params_token = {"action": "query", "meta": "tokens", "type": "login", "format": "json"} + try: + r1 = wiki_session.get(WIKI_API_URL, params=params_token, timeout=10) + r1.raise_for_status() + token = r1.json().get("query", {}).get("tokens", {}).get("logintoken") + if not token: + raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Token-Error: {e}") + # Schritt 2: Login durchführen mit BotPasswort + login_data = { + "action": "login", + "format": "json", + "lgname": lgname, + "lgpassword": lgpassword, + "lgtoken": token + } + try: + r2 = wiki_session.post(WIKI_API_URL, data=login_data, timeout=10) + r2.raise_for_status() + result = r2.json().get("login", {}) + if result.get("result") != "Success": + return WikiLoginResponse(status="failed", message=result.get("reason")) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Login-Error: {e}") + return WikiLoginResponse(status="success", message=None) + +# 1) Kategorie abrufen +@router.get("/pages", response_model=List[CategoryMembersResponse]) +def list_category_members(category: str = Query(..., description="Name der Kategorie, ohne 'Category:'")): + cmtitle = f"Category:{category}" + params = {"action": "query", "list": "categorymembers", "cmtitle": cmtitle, "cmlimit": 500, "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + members = r.json().get("query", {}).get("categorymembers", []) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Kategorie-Error: {e}") + return [CategoryMembersResponse(pageid=m["pageid"], title=m["title"]) for m in members] + +# 2) Seiteninhalt abrufen +@router.post("/pagecontent", response_model=PageContentResponse) +def get_page_content(pageid: int = Query(...), title: str = Query(None)): + params = {"action": "query", "prop": "revisions", "rvprop": "content", "rvslots": "main", "pageids": pageid, "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + pages = r.json().get("query", {}).get("pages", {}) + page = pages.get(str(pageid), {}) + content = page.get("revisions", [{}])[0].get("slots", {}).get("main", {}).get("*", "") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Content-Error: {e}") + return PageContentResponse(pageid=pageid, title=title or page.get("title"), content=content) diff --git a/llm-api/wiki_router0.1.2.py b/llm-api/wiki_router0.1.2.py new file mode 100644 index 0000000..a2fe11a --- /dev/null +++ b/llm-api/wiki_router0.1.2.py @@ -0,0 +1,121 @@ +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import List +import requests, os + +# Version hochgezählt 
+__version__ = "1.1.6" + +router = APIRouter() + +# MediaWiki-Konfiguration +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +WIKI_BOT_USER = os.getenv("WIKI_BOT_USER", "") +WIKI_BOT_PASSWORD = os.getenv("WIKI_BOT_PASSWORD", "") +wiki_session = requests.Session() + +# Models +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class CategoryMembersResponse(BaseModel): + pageid: int + title: str + +class PageContentResponse(BaseModel): + pageid: int + title: str + content: str + +# Health-Check +@router.get("/health") +def health_check(): + params = {"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"} + try: + resp = wiki_session.get(WIKI_API_URL, params=params, timeout=5) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + # Direkter Abgleich zu Testzwecken + if data.username == WIKI_BOT_USER and data.password == WIKI_BOT_PASSWORD: + return WikiLoginResponse(status="success", message=None) + return WikiLoginResponse(status="failed", message="Incorrect username or password.") + +# 1) Kategorie abrufen +@router.get("/pages", response_model=List[CategoryMembersResponse]) +def list_category_members(category: str = Query(..., description="Kategorie-Name ohne 'Category:'")): + cmtitle = f"Category:{category}" + params = {"action": "query", "list": "categorymembers", "cmtitle": cmtitle, "cmlimit": 500, "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + members = r.json().get("query", {}).get("categorymembers", []) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Kategorie-Error: {e}") + return [CategoryMembersResponse(pageid=m["pageid"], title=m["title"]) for m in members] + +# 2) Seiteninhalt abrufen +@router.post("/pagecontent", response_model=PageContentResponse) +def get_page_content(pageid: int = Query(...), title: str = Query(None)): + params = {"action": "query", "prop": "revisions", "rvprop": "content", "rvslots": "main", "pageids": pageid, "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + pages = r.json().get("query", {}).get("pages", {}) + page = pages.get(str(pageid), {}) + content = page.get("revisions", [{}])[0].get("slots", {}).get("main", {}).get("*", "") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Content-Error: {e}") + return PageContentResponse(pageid=pageid, title=title or page.get("title"), content=content) + +# 3) Importiere Übungen aus Kategorie +@router.post("/import/exercises") +def import_exercises(category: str = Query(..., description="Kategorie ohne 'Category:'")): + """ + Holt alle Seiten einer Kategorie, parsed deren Wikitext und importiert Übungen. 
+ """ + import mwparserfromhell + # Schritt 1: Seitenliste + pages = list_category_members(category) + imported = [] + for p in pages: + # Schritt 2: Inhalt ziehen + pc = get_page_content(pageid=p.pageid, title=p.title) + wikicode = mwparserfromhell.parse(pc.content) + # Infobox parsen + templates = wikicode.filter_templates() + infobox = next((t for t in templates if t.name.strip() == 'ÜbungInfoBox'), None) + if not infobox: + continue + # Felder extrahieren + ex = { + 'title': infobox.get('title').value.strip() if infobox.has('title') else p.title, + 'summary': infobox.get('summary').value.strip() if infobox.has('summary') else '', + 'short_description': infobox.get('short_description').value.strip() if infobox.has('short_description') else '', + 'keywords': [kw.strip() for kw in infobox.get('keywords').value.split(',')] if infobox.has('keywords') else [], + 'link': None, + 'discipline': infobox.get('discipline').value.strip() if infobox.has('discipline') else '', + 'group': infobox.get('group').value.strip() if infobox.has('group') else None, + 'age_group': infobox.get('age_group').value.strip() if infobox.has('age_group') else '', + 'target_group': infobox.get('target_group').value.strip() if infobox.has('target_group') else '', + 'min_participants': int(infobox.get('min_participants').value.strip()) if infobox.has('min_participants') else 1, + 'duration_minutes': int(infobox.get('duration').value.strip()) if infobox.has('duration') else 0, + 'capabilities': {}, + 'category': category, + 'purpose': '', 'execution': '', 'notes': '', 'preparation': '', 'method': '', 'equipment': [] + } + # POST an Exercise-Endpoint + resp = requests.post(f"{os.getenv('APP_URL','http://localhost:8000')}/exercise", json=ex) + if resp.status_code == 200: + imported.append(resp.json().get('id')) + return {"imported": imported} diff --git a/llm-api/wiki_router1.1.5.py b/llm-api/wiki_router1.1.5.py new file mode 100644 index 0000000..f8ddfeb --- /dev/null +++ b/llm-api/wiki_router1.1.5.py @@ -0,0 +1,168 @@ +""" +File: wiki_router.py +Beschreibung: +- Enthält Endpunkte für MediaWiki-Integration im lokalen Netzwerk. +- Funktionen: + * /health: Prüft Verfügbarkeit der MediaWiki-API. + * /login: Führt clientlogin durch und speichert Session-Cookies. + * /pages: Listet Seiten einer Kategorie (Artikel im Namespace 0). + * /pagecontent: Ruft Wikitext einer Seite ab. + * /semantic/pages: Führt SMW-Ask-Abfrage aus. + * /import/exercises: Importiert Übungen per Infobox-Parsing. 
+Version: 1.1.6 +""" +from dotenv import load_dotenv +load_dotenv() +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import List, Dict +import requests, os + +# Version hochgezählt +__version__ = "1.1.6" + +router = APIRouter() + +# MediaWiki-Konfiguration +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +WIKI_BOT_USER = os.getenv("WIKI_BOT_USER", "") +WIKI_BOT_PASSWORD = os.getenv("WIKI_BOT_PASSWORD", "") +wiki_session = requests.Session() + +# Models +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class CategoryMembersResponse(BaseModel): + pageid: int + title: str + +class PageContentResponse(BaseModel): + pageid: int + title: str + content: str + +# Health-Check +@router.get("/health") +def health_check(): + try: + resp = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"}, timeout=5) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + # clientlogin Token holen + try: + token_resp = wiki_session.get(WIKI_API_URL, params={"action":"query","meta":"tokens","type":"login","format":"json"}, timeout=10) + token_resp.raise_for_status() + token = token_resp.json().get("query",{}).get("tokens",{}).get("logintoken") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Token-Error: {e}") + if not token: + raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") + # clientlogin + try: + login_resp = wiki_session.post(WIKI_API_URL, data={"action":"clientlogin","format":"json","username":data.username,"password":data.password,"logintoken":token,"loginreturnurl":"http://localhost:8000"}, timeout=10) + login_resp.raise_for_status() + cl = login_resp.json().get("clientlogin", {}) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Login-Error: {e}") + if cl.get("status") == "PASS": + return WikiLoginResponse(status="success", message=None) + # fallback action=login + try: + alt = wiki_session.post(WIKI_API_URL, data={"action":"login","format":"json","lgname":data.username,"lgpassword":data.password}, timeout=10) + alt.raise_for_status() + res = alt.json().get("login",{}) + if res.get("result") == "Success": + return WikiLoginResponse(status="success", message=None) + else: + return WikiLoginResponse(status="failed", message=res.get("reason")) + except Exception as e: + return WikiLoginResponse(status="failed", message=str(e)) + +# List category members (Namespace 0) +@router.get("/pages", response_model=List[CategoryMembersResponse]) +def list_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")): + cmtitle = f"Category:{category}" + params = {"action":"query","list":"categorymembers","cmtitle":cmtitle,"cmnamespace":0,"cmlimit":50000,"format":"json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + members = r.json().get("query",{}).get("categorymembers",[]) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Kategorie-Error: {e}") + return [CategoryMembersResponse(pageid=m["pageid"], title=m["title"]) for m in members] + +# Fetch page content +@router.post("/pagecontent", 
response_model=PageContentResponse) +def get_page_content(pageid: int = Query(...), title: str = Query(None)): + params = {"action":"query","prop":"revisions","rvprop":"content","rvslots":"main","pageids":pageid,"format":"json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + pages = r.json().get("query",{}).get("pages",{}) + content = pages.get(str(pageid),{}).get("revisions",[{}])[0].get("slots",{}).get("main",{}).get("*","") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Content-Error: {e}") + return PageContentResponse(pageid=pageid, title=title or pages[str(pageid)].get("title"), content=content) + +# SMW-Ask query (rekursive Abfrage über Unterkategorien) +@router.get("/semantic/pages") +def semantic_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict: + """ + Führt eine rekursive SMW Ask-Abfrage durch, um strukturierte Daten für eine Kategorie inkl. Unterkategorien zu erhalten. + Limit und Format sind anpassbar. + """ + smw_query = f"[[Category:{category}]]" + # Ask-Parameter: query string mit Limit + ask_query = f"{smw_query}|limit=50000" + params = { + "action": "ask", + "query": ask_query, + "format": "json" + } + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=20) + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}") + return r.json() + +# Import exercises +@router.get("/import/exercises", response_model=Dict[str,List[str]]) +def import_exercises_get(category: str = Query(...)): + return import_exercises(category) + +@router.post("/import/exercises", response_model=Dict[str,List[str]]) +def import_exercises(category: str = Query(...)): + import mwparserfromhell + imported = [] + for p in list_category_members(category): + pc = get_page_content(pageid=p.pageid, title=p.title) + wikicode = mwparserfromhell.parse(pc.content) + infobox = next((t for t in wikicode.filter_templates() if t.name.strip()=="ÜbungInfoBox"),None) + if not infobox: + continue + ex = { 'title': infobox.get('title').value.strip() if infobox.has('title') else p.title, + 'summary': infobox.get('summary').value.strip() if infobox.has('summary') else '', + 'short_description': infobox.get('short_description').value.strip() if infobox.has('short_description') else '', + 'keywords':[kw.strip() for kw in infobox.get('keywords').value.split(',')] if infobox.has('keywords') else [], + 'link':None,'discipline':infobox.get('discipline').value.strip() if infobox.has('discipline') else '', + 'group':infobox.get('group').value.strip() if infobox.has('group') else None,'age_group':infobox.get('age_group').value.strip() if infobox.has('age_group') else '', + 'target_group':infobox.get('target_group').value.strip() if infobox.has('target_group') else '','min_participants':int(infobox.get('min_participants').value.strip()) if infobox.has('min_participants') else 1, + 'duration_minutes':int(infobox.get('duration').value.strip()) if infobox.has('duration') else 0,'capabilities':{},'category':category, + 'purpose':'','execution':'','notes':'','preparation':'','method':'','equipment':[] } + resp = requests.post(f"{os.getenv('APP_URL','http://localhost:8000')}/exercise", json=ex) + if resp.status_code==200: + imported.append(resp.json().get('id')) + return {"imported":imported} diff --git a/llm-api/wiki_router1.1.6.py b/llm-api/wiki_router1.1.6.py new file mode 100644 index 0000000..4a66dce --- /dev/null +++ b/llm-api/wiki_router1.1.6.py @@ 
-0,0 +1,164 @@ +""" +File: wiki_router.py +Beschreibung: +- Enthält Endpunkte für MediaWiki-Integration im lokalen Netzwerk. +- Funktionen: + * /health: Prüft Verfügbarkeit der MediaWiki-API. + * /login: Führt clientlogin durch und speichert Session-Cookies. + * /pages: Listet Seiten einer Kategorie (Artikel im Namespace 0). + * /pagecontent: Ruft Wikitext einer Seite ab. + * /semantic/pages: Führt SMW-Ask-Abfrage aus. + * /import/exercises: Importiert Übungen per Infobox-Parsing. +Version: 1.1.6 +""" +from dotenv import load_dotenv +load_dotenv() +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import List, Dict +import requests, os + +# Version hochgezählt +__version__ = "1.1.6" + +router = APIRouter() + +# MediaWiki-Konfiguration +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +WIKI_BOT_USER = os.getenv("WIKI_BOT_USER", "") +WIKI_BOT_PASSWORD = os.getenv("WIKI_BOT_PASSWORD", "") +wiki_session = requests.Session() + +# Models +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class CategoryMembersResponse(BaseModel): + pageid: int + title: str + +class PageContentResponse(BaseModel): + pageid: int + title: str + content: str + +# Health-Check +@router.get("/health") +def health_check(): + try: + resp = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"}, timeout=5) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + # clientlogin Token holen + try: + token_resp = wiki_session.get(WIKI_API_URL, params={"action":"query","meta":"tokens","type":"login","format":"json"}, timeout=10) + token_resp.raise_for_status() + token = token_resp.json().get("query",{}).get("tokens",{}).get("logintoken") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Token-Error: {e}") + if not token: + raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") + # clientlogin + try: + login_resp = wiki_session.post(WIKI_API_URL, data={"action":"clientlogin","format":"json","username":data.username,"password":data.password,"logintoken":token,"loginreturnurl":"http://localhost:8000"}, timeout=10) + login_resp.raise_for_status() + cl = login_resp.json().get("clientlogin", {}) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Login-Error: {e}") + if cl.get("status") == "PASS": + return WikiLoginResponse(status="success", message=None) + # fallback action=login + try: + alt = wiki_session.post(WIKI_API_URL, data={"action":"login","format":"json","lgname":data.username,"lgpassword":data.password}, timeout=10) + alt.raise_for_status() + res = alt.json().get("login",{}) + if res.get("result") == "Success": + return WikiLoginResponse(status="success", message=None) + else: + return WikiLoginResponse(status="failed", message=res.get("reason")) + except Exception as e: + return WikiLoginResponse(status="failed", message=str(e)) + +# List category members (Namespace 0) +@router.get("/pages", response_model=List[CategoryMembersResponse]) +def list_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")): + cmtitle = f"Category:{category}" + params = 
{"action":"query","list":"categorymembers","cmtitle":cmtitle,"cmnamespace":0,"cmlimit":50000,"format":"json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + members = r.json().get("query",{}).get("categorymembers",[]) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Kategorie-Error: {e}") + return [CategoryMembersResponse(pageid=m["pageid"], title=m["title"]) for m in members] + +# Fetch page content +@router.post("/pagecontent", response_model=PageContentResponse) +def get_page_content(pageid: int = Query(...), title: str = Query(None)): + params = {"action":"query","prop":"revisions","rvprop":"content","rvslots":"main","pageids":pageid,"format":"json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + pages = r.json().get("query",{}).get("pages",{}) + content = pages.get(str(pageid),{}).get("revisions",[{}])[0].get("slots",{}).get("main",{}).get("*","") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Content-Error: {e}") + return PageContentResponse(pageid=pageid, title=title or pages[str(pageid)].get("title"), content=content) + +# SMW-Ask query (rekursive Abfrage aller Seiten inkl. Unterkategorien) +@router.get("/semantic/pages") +def semantic_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict: + """ + Nutzt SMW Ask mit rekursiver Kategorie-Abfrage, um alle Seiten zurückzugeben. + Limit=50000, kein spezifischer Printout, das Ergebnis-JSON enthält alle Felder. + """ + smw_query = f"[[Category:{category}]]" + ask_query = f"{smw_query}|limit=50000" + params = {"action": "ask", "query": ask_query, "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=20) + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}") + result = r.json().get("query", {}).get("results", {}) + return result + +# Import exercises +@router.get("/import/exercises", response_model=Dict[str,List[str]]) +def import_exercises_get(category: str = Query(...)): + return import_exercises(category) + +@router.post("/import/exercises", response_model=Dict[str,List[str]]) +def import_exercises(category: str = Query(...)): + import mwparserfromhell + imported = [] + for p in list_category_members(category): + pc = get_page_content(pageid=p.pageid, title=p.title) + wikicode = mwparserfromhell.parse(pc.content) + infobox = next((t for t in wikicode.filter_templates() if t.name.strip()=="ÜbungInfoBox"),None) + if not infobox: + continue + ex = { 'title': infobox.get('title').value.strip() if infobox.has('title') else p.title, + 'summary': infobox.get('summary').value.strip() if infobox.has('summary') else '', + 'short_description': infobox.get('short_description').value.strip() if infobox.has('short_description') else '', + 'keywords':[kw.strip() for kw in infobox.get('keywords').value.split(',')] if infobox.has('keywords') else [], + 'link':None,'discipline':infobox.get('discipline').value.strip() if infobox.has('discipline') else '', + 'group':infobox.get('group').value.strip() if infobox.has('group') else None,'age_group':infobox.get('age_group').value.strip() if infobox.has('age_group') else '', + 'target_group':infobox.get('target_group').value.strip() if infobox.has('target_group') else '','min_participants':int(infobox.get('min_participants').value.strip()) if infobox.has('min_participants') else 1, + 
'duration_minutes':int(infobox.get('duration').value.strip()) if infobox.has('duration') else 0,'capabilities':{},'category':category,
+               'purpose':'','execution':'','notes':'','preparation':'','method':'','equipment':[] }
+        resp = requests.post(f"{os.getenv('APP_URL','http://localhost:8000')}/exercise", json=ex)
+        if resp.status_code==200:
+            imported.append(resp.json().get('id'))
+    return {"imported":imported}
diff --git a/llm-api/wiki_router1.1.7.py b/llm-api/wiki_router1.1.7.py
new file mode 100644
index 0000000..333e11c
--- /dev/null
+++ b/llm-api/wiki_router1.1.7.py
@@ -0,0 +1,165 @@
+"""
+File: wiki_router.py
+Beschreibung:
+- Enthält Endpunkte für MediaWiki-Integration im lokalen Netzwerk.
+- Funktionen:
+  * /health: Prüft Verfügbarkeit der MediaWiki-API.
+  * /login: Führt clientlogin durch und speichert Session-Cookies.
+  * /pages: Listet Seiten einer Kategorie (Artikel im Namespace 0).
+  * /pagecontent: Ruft Wikitext einer Seite ab.
+  * /semantic/pages: Führt SMW-Ask-Abfrage aus.
+  * /import/exercises: Importiert Übungen per Infobox-Parsing.
+Version: 1.1.7
+"""
+from dotenv import load_dotenv
+load_dotenv()
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel
+from typing import List, Dict
+import requests, os
+
+# Version hochgezählt
+__version__ = "1.1.7"
+
+router = APIRouter()
+
+# MediaWiki-Konfiguration
+WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php")
+WIKI_BOT_USER = os.getenv("WIKI_BOT_USER", "")
+WIKI_BOT_PASSWORD = os.getenv("WIKI_BOT_PASSWORD", "")
+wiki_session = requests.Session()
+
+# Models
+class WikiLoginRequest(BaseModel):
+    username: str
+    password: str
+
+class WikiLoginResponse(BaseModel):
+    status: str
+    message: str | None = None
+
+class CategoryMembersResponse(BaseModel):
+    pageid: int
+    title: str
+
+class PageContentResponse(BaseModel):
+    pageid: int
+    title: str
+    content: str
+
+# Health-Check
+@router.get("/health")
+def health_check():
+    try:
+        resp = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"}, timeout=5)
+        resp.raise_for_status()
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
+    return {"status": "ok"}
+
+# Login Endpoint
+@router.post("/login", response_model=WikiLoginResponse)
+def login(data: WikiLoginRequest):
+    # clientlogin Token holen
+    try:
+        token_resp = wiki_session.get(WIKI_API_URL, params={"action":"query","meta":"tokens","type":"login","format":"json"}, timeout=10)
+        token_resp.raise_for_status()
+        token = token_resp.json().get("query",{}).get("tokens",{}).get("logintoken")
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
+    if not token:
+        raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
+    # clientlogin
+    try:
+        login_resp = wiki_session.post(WIKI_API_URL, data={"action":"clientlogin","format":"json","username":data.username,"password":data.password,"logintoken":token,"loginreturnurl":"http://localhost:8000"}, timeout=10)
+        login_resp.raise_for_status()
+        cl = login_resp.json().get("clientlogin", {})
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Login-Error: {e}")
+    if cl.get("status") == "PASS":
+        return WikiLoginResponse(status="success", message=None)
+    # fallback action=login
+    try:
+        alt = wiki_session.post(WIKI_API_URL, data={"action":"login","format":"json","lgname":data.username,"lgpassword":data.password}, timeout=10)
+        alt.raise_for_status()
+        res
= alt.json().get("login",{}) + if res.get("result") == "Success": + return WikiLoginResponse(status="success", message=None) + else: + return WikiLoginResponse(status="failed", message=res.get("reason")) + except Exception as e: + return WikiLoginResponse(status="failed", message=str(e)) + +# List category members (Namespace 0) +@router.get("/pages", response_model=List[CategoryMembersResponse]) +def list_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")): + cmtitle = f"Category:{category}" + params = {"action":"query","list":"categorymembers","cmtitle":cmtitle,"cmnamespace":0,"cmlimit":50000,"format":"json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + members = r.json().get("query",{}).get("categorymembers",[]) + except Exception as e: + raise HTTPException(status_code=502, detail=f"Kategorie-Error: {e}") + return [CategoryMembersResponse(pageid=m["pageid"], title=m["title"]) for m in members] + +# Fetch page content +@router.post("/pagecontent", response_model=PageContentResponse) +def get_page_content(pageid: int = Query(...), title: str = Query(None)): + params = {"action":"query","prop":"revisions","rvprop":"content","rvslots":"main","pageids":pageid,"format":"json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + r.raise_for_status() + pages = r.json().get("query",{}).get("pages",{}) + content = pages.get(str(pageid),{}).get("revisions",[{}])[0].get("slots",{}).get("main",{}).get("*","") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Content-Error: {e}") + return PageContentResponse(pageid=pageid, title=title or pages[str(pageid)].get("title"), content=content) + +# SMW-Ask query (rekursive Abfrage aller Seiten inkl. Unterkategorien) +@router.get("/semantic/pages") +def semantic_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict: + """ + Nutzt SMW Ask mit rekursiver Kategorie-Abfrage, um alle Seiten zurückzugeben. + Limit=50000 ohne Printout, das komplette Ergebnis-Set wird geliefert. 
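+    Beispiel: category="Übungen" erzeugt die Ask-Query "[[Category:Übungen]]|limit=50000".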
+ """ + smw_query = f"[[Category:{category}]]" + # Ask-Parameter: Limit für rekursive Abfrage + ask_query = f"{smw_query}|limit=50000" + params = {"action": "ask", "query": ask_query, "format": "json"} + try: + r = wiki_session.get(WIKI_API_URL, params=params, timeout=30) + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}") + data = r.json().get("query", {}).get("results", {}) + return data + +# Import exercises +@router.get("/import/exercises", response_model=Dict[str,List[str]]) +def import_exercises_get(category: str = Query(...)): + return import_exercises(category) + +@router.post("/import/exercises", response_model=Dict[str,List[str]]) +def import_exercises(category: str = Query(...)): + import mwparserfromhell + imported = [] + for p in list_category_members(category): + pc = get_page_content(pageid=p.pageid, title=p.title) + wikicode = mwparserfromhell.parse(pc.content) + infobox = next((t for t in wikicode.filter_templates() if t.name.strip()=="ÜbungInfoBox"),None) + if not infobox: + continue + ex = { 'title': infobox.get('title').value.strip() if infobox.has('title') else p.title, + 'summary': infobox.get('summary').value.strip() if infobox.has('summary') else '', + 'short_description': infobox.get('short_description').value.strip() if infobox.has('short_description') else '', + 'keywords':[kw.strip() for kw in infobox.get('keywords').value.split(',')] if infobox.has('keywords') else [], + 'link':None,'discipline':infobox.get('discipline').value.strip() if infobox.has('discipline') else '', + 'group':infobox.get('group').value.strip() if infobox.has('group') else None,'age_group':infobox.get('age_group').value.strip() if infobox.has('age_group') else '', + 'target_group':infobox.get('target_group').value.strip() if infobox.has('target_group') else '','min_participants':int(infobox.get('min_participants').value.strip()) if infobox.has('min_participants') else 1, + 'duration_minutes':int(infobox.get('duration').value.strip()) if infobox.has('duration') else 0,'capabilities':{},'category':category, + 'purpose':'','execution':'','notes':'','preparation':'','method':'','equipment':[] } + resp = requests.post(f"{os.getenv('APP_URL','http://localhost:8000')}/exercise", json=ex) + if resp.status_code==200: + imported.append(resp.json().get('id')) + return {"imported":imported} diff --git a/llm-api/wiki_router1.1.9.py b/llm-api/wiki_router1.1.9.py new file mode 100644 index 0000000..a19b855 --- /dev/null +++ b/llm-api/wiki_router1.1.9.py @@ -0,0 +1,172 @@ +""" +File: wiki_router.py +Beschreibung: +- Endpunkte für MediaWiki-Integration im lokalen Netzwerk. +- Funktionen: + * /health: Prüft Verfügbarkeit der MediaWiki-API. + * /login: Führt clientlogin durch und speichert Session-Cookies. + * /pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask. + * /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab. + * /semantic/page: Liefert Metadaten einer Übung und Wikitext über parse. 
+Version: 1.1.9 +""" +from dotenv import load_dotenv +load_dotenv() +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import Dict, Any, List +import requests, os + +__version__ = "1.1.9" +router = APIRouter() + +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +wiki_session = requests.Session() + +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class PageContentResponse(BaseModel): + pageid: int + title: str + wikitext: str + +# Health-Check +@router.get("/health") +def health_check(): + try: + resp = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"}, + timeout=5 + ) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + # Token holen + try: + token_resp = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "meta": "tokens", "type": "login", "format": "json"}, + timeout=10 + ) + token_resp.raise_for_status() + token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Token-Error: {e}") + if not token: + raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") + # clientlogin + try: + login_resp = wiki_session.post( + WIKI_API_URL, + data={ + "action": "clientlogin", + "format": "json", + "username": data.username, + "password": data.password, + "logintoken": token, + "loginreturnurl": "http://localhost:8000" + }, + timeout=10 + ) + login_resp.raise_for_status() + status = login_resp.json().get("clientlogin", {}).get("status") + except Exception: + status = None + # fallback login + if status != "PASS": + alt = wiki_session.post( + WIKI_API_URL, + data={"action": "login", "format": "json", "lgname": data.username, "lgpassword": data.password}, + timeout=10 + ) + alt.raise_for_status() + status = alt.json().get("login", {}).get("result") + if status in ("PASS", "Success"): + return WikiLoginResponse(status="success", message=None) + return WikiLoginResponse(status="failed", message="Login fehlgeschlagen") + +# SMW-Ask: alle Übungen inkl. 
Unterkategorien +@router.get("/semantic/pages") +def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]: + smw_query = f"[[Category:{category}]]" + ask_query = f"{smw_query}|limit=50000" + r = wiki_session.get( + WIKI_API_URL, + params={"action": "ask", "query": ask_query, "format": "json"}, + timeout=30 + ) + try: + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}") + return r.json().get("query", {}).get("results", {}) + +# Liste direkter Category Members (für pageid fallback) +@router.get("/pages") +def list_category_members(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> List[Dict[str, Any]]: + cmtitle = f"Category:{category}" + params = {"action": "query", "list": "categorymembers", "cmtitle": cmtitle, "cmnamespace": 0, "cmlimit": 50000, "format": "json"} + r = wiki_session.get(WIKI_API_URL, params=params, timeout=10) + try: + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Kategorie-Error: {e}") + return r.json().get("query", {}).get("categorymembers", []) + +# Wikitext über parse-Endpoint holen +@router.get("/parsepage", response_model=PageContentResponse) +def parse_page(pageid: int = Query(...), title: str = Query(None)): + r = wiki_session.get( + WIKI_API_URL, + params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, + timeout=20 + ) + try: + r.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Parse-Error: {e}") + wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "") + return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext) + +# Detail-Endpoint für eine Übung: Metadaten aus Ask + Wikitext via parse (Titel) +@router.get("/semantic/page") +def semantic_page_detail(category: str = Query(...), title: str = Query(...)) -> Dict[str, Any]: + """ + Liefert Metadaten und Wikitext einer einzelnen Übung. + Nutzt SMW-Ask für Metadaten und den Parse-Endpoint per Titel für den Wikitext. + """ + # Metadaten aus SMW-Ask + entries = semantic_pages(category) + entry = entries.get(title) + if not entry: + raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.") + # Wikitext direkt über Parse-Endpoint per Titel laden + try: + r = wiki_session.get( + WIKI_API_URL, + params={"action": "parse", "page": title, "prop": "wikitext", "format": "json"}, + timeout=20 + ) + r.raise_for_status() + wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Parse-Error: {e}") + return { + "title": title, + "pageid": entry.get("pageid"), + "fullurl": entry.get("fullurl"), + "printouts": entry.get("printouts", {}), + "wikitext": wikitext + } diff --git a/llm-api/wiki_router1.2.0.py b/llm-api/wiki_router1.2.0.py new file mode 100644 index 0000000..8f224fc --- /dev/null +++ b/llm-api/wiki_router1.2.0.py @@ -0,0 +1,173 @@ +""" +File: wiki_router.py +Beschreibung: +- Endpunkte für MediaWiki-Integration im lokalen Netzwerk. +- Funktionen: + * /health: Prüft Verfügbarkeit der MediaWiki-API. + * /login: Führt clientlogin durch und speichert Session-Cookies. + * /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask. + * /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab. + * /info: Liefert pageid und fullurl über Core-API Query. 
+ * /semantic/page: Liefert Metadaten einer Übung und Wikitext sowie pageid über Core-API. +Version: 1.2.0 +""" +from dotenv import load_dotenv +load_dotenv() +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from typing import Dict, Any, List +import requests, os + +__version__ = "1.2.0" +router = APIRouter() + +WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") +wiki_session = requests.Session() + +class WikiLoginRequest(BaseModel): + username: str + password: str + +class WikiLoginResponse(BaseModel): + status: str + message: str | None = None + +class PageContentResponse(BaseModel): + pageid: int + title: str + wikitext: str + +class PageInfoResponse(BaseModel): + pageid: int + title: str + fullurl: str + +# Health-Check +@router.get("/health") +def health_check(): + try: + resp = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"}, + timeout=5 + ) + resp.raise_for_status() + except Exception as e: + raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") + return {"status": "ok"} + +# Login Endpoint +@router.post("/login", response_model=WikiLoginResponse) +def login(data: WikiLoginRequest): + # Token holen + try: + token_resp = wiki_session.get( + WIKI_API_URL, + params={"action": "query", "meta": "tokens", "type": "login", "format": "json"}, + timeout=10 + ) + token_resp.raise_for_status() + token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Token-Error: {e}") + if not token: + raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") + # clientlogin + try: + login_resp = wiki_session.post( + WIKI_API_URL, + data={ + "action": "clientlogin", + "format": "json", + "username": data.username, + "password": data.password, + "logintoken": token, + "loginreturnurl": "http://localhost:8000" + }, + timeout=10 + ) + login_resp.raise_for_status() + status = login_resp.json().get("clientlogin", {}).get("status") + except Exception: + status = None + # fallback login + if status != "PASS": + alt = wiki_session.post( + WIKI_API_URL, + data={"action": "login", "format": "json", "lgname": data.username, "lgpassword": data.password}, + timeout=10 + ) + alt.raise_for_status() + status = alt.json().get("login", {}).get("result") + if status in ("PASS", "Success"): + return WikiLoginResponse(status="success", message=None) + return WikiLoginResponse(status="failed", message="Login fehlgeschlagen") + +# SMW-Ask: alle Übungen inkl. 
Unterkategorien
+@router.get("/semantic/pages")
+def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
+    smw_query = f"[[Category:{category}]]"
+    ask_query = f"{smw_query}|limit=50000"
+    r = wiki_session.get(
+        WIKI_API_URL,
+        params={"action": "ask", "query": ask_query, "format": "json"},
+        timeout=30
+    )
+    try:
+        r.raise_for_status()
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
+    return r.json().get("query", {}).get("results", {})
+
+# Wikitext über parse-Endpoint holen (per pageid)
+@router.get("/parsepage", response_model=PageContentResponse)
+def parse_page(pageid: int = Query(...), title: str = Query(None)):
+    r = wiki_session.get(
+        WIKI_API_URL,
+        params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"},
+        timeout=20
+    )
+    try:
+        r.raise_for_status()
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
+    wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
+    return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
+
+# Pageinfo über Core-API (ermittelt pageid + fullurl)
+@router.get("/info", response_model=PageInfoResponse)
+def page_info(title: str = Query(..., description="Name der Seite")):
+    r = wiki_session.get(
+        WIKI_API_URL,
+        params={"action": "query", "titles": title, "prop": "info", "inprop": "url", "format": "json"},
+        timeout=10
+    )
+    try:
+        r.raise_for_status()
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Info-Error: {e}")
+    pages = r.json().get("query", {}).get("pages", {})
+    pid_str, page = next(iter(pages.items()))
+    pid = int(pid_str)
+    fullurl = page.get("fullurl")
+    return PageInfoResponse(pageid=pid, title=page.get("title"), fullurl=fullurl)
+
+# Detail-Endpoint für eine Übung: Metadaten aus Ask + Wikitext & ID via Core-API
+@router.get("/semantic/page", response_model=Dict[str, Any])
+def semantic_page_detail(category: str = Query(...), title: str = Query(...)) -> Dict[str, Any]:
+    # Metadaten aus SMW-Ask
+    entries = semantic_pages(category)
+    entry = entries.get(title)
+    if not entry:
+        raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
+    # Pageinfo via Core-API
+    info = page_info(title=title)
+    # Wikitext via parse
+    parsed = parse_page(pageid=info.pageid, title=title)
+    return {
+        "title": title,
+        "pageid": info.pageid,
+        "fullurl": info.fullurl,
+        "printouts": entry.get("printouts", {}),
+        "wikitext": parsed.wikitext
+    }
diff --git a/scripts/archiv/chunker_utils.py b/scripts/archiv/chunker_utils.py
new file mode 100644
index 0000000..df00d94
--- /dev/null
+++ b/scripts/archiv/chunker_utils.py
@@ -0,0 +1,34 @@
+import re
+
+def chunk_text_paragraphs(text: str, max_length: int = 500) -> list[str]:
+    # Absätze trennen
+    paragraphs = re.split(r'\n\s*\n', text.strip())
+    chunks: list[str] = []
+    current_chunk = ""
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # Passt der Absatz noch zum aktuellen Chunk? +2 für die später hinzugefügten "\n\n"
+        if len(current_chunk) + len(para) + 2 <= max_length:
+            current_chunk = (current_chunk + "\n\n" + para) if current_chunk else para
+        else:
+            # Aktuellen Chunk abschließen
+            if current_chunk:
+                chunks.append(current_chunk)
+
+            # Ist der Absatz selbst zu groß? Dann hart splitten
+            if len(para) > max_length:
+                for i in range(0, len(para), max_length):
+                    chunks.append(para[i:i + max_length])
+                current_chunk = ""
+            else:
+                current_chunk = para
+
+    # Letzten Chunk nicht vergessen
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
diff --git a/scripts/archiv/import_documents.py b/scripts/archiv/import_documents.py
new file mode 100644
index 0000000..7c86d1c
--- /dev/null
+++ b/scripts/archiv/import_documents.py
@@ -0,0 +1,93 @@
+import os
+import sys
+import shutil
+import requests
+from tqdm import tqdm
+
+# 📌 Konfiguration
+API_URL = "http://localhost:8000/embed"
+CHUNK_SIZE = 500
+OVERLAP = 100
+BATCH_SIZE = 20
+
+# 📁 Kommandozeilenparameter auswerten
+if len(sys.argv) != 2:
+    print("❌ Bitte gib eine Kategorie an, z. B.: python import_documents.py karatetrainer")
+    sys.exit(1)
+
+CATEGORY = sys.argv[1]
+SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
+ARCHIVE_DIR = os.path.join(SOURCE_DIR, "_imported")
+COLLECTION = CATEGORY
+
+if not os.path.exists(SOURCE_DIR):
+    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
+    sys.exit(1)
+
+os.makedirs(ARCHIVE_DIR, exist_ok=True)
+
+print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
+print(f"📂 Archivierte Dateien: {ARCHIVE_DIR}")
+print(f"🎯 Ziel-Collection: {COLLECTION}")
+
+# 🔧 Text in überlappende Chunks aufteilen
+def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = min(start + size, len(text))
+        chunks.append(text[start:end])
+        start += size - overlap
+    return chunks
+
+# 📚 Alle .txt-Dateien im Ordner lesen
+def read_all_text_files(folder):
+    file_chunk_map = {}  # Map: filename → chunks
+    for filename in os.listdir(folder):
+        if filename.endswith(".txt"):
+            path = os.path.join(folder, filename)
+            with open(path, "r", encoding="utf-8") as f:
+                text = f.read()
+            file_chunk_map[filename] = chunk_text(text)
+    return file_chunk_map
+
+# 📤 An API senden
+def embed_chunks_in_batches(chunks, collection):
+    results = []
+    for i in tqdm(range(0, len(chunks), BATCH_SIZE), desc="📡 Embedding"):
+        batch = chunks[i:i + BATCH_SIZE]
+        response = requests.post(API_URL, json={"texts": batch, "collection": collection})
+        response.raise_for_status()
+        results.append(response.json())
+    return results
+
+# 🚀 Hauptlogik
+if __name__ == "__main__":
+    file_chunk_map = read_all_text_files(SOURCE_DIR)
+    all_chunks = []
+    processed_files = []
+
+    for filename, chunks in file_chunk_map.items():
+        if chunks:
+            all_chunks.extend(chunks)
+            processed_files.append(filename)
+
+    if not all_chunks:
+        print("⚠️ Keine Textabschnitte gefunden.")
+        sys.exit(0)
+
+    print(f"📦 {len(all_chunks)} Textabschnitte aus {len(processed_files)} Dateien gefunden.")
+
+    try:
+        result = embed_chunks_in_batches(all_chunks, COLLECTION)
+        print(f"\n✅ Embedding abgeschlossen: {len(result)} API-Antwort(en) erhalten.")
+
+        # 🗃️ Verarbeitete Dateien archivieren
+        for filename in processed_files:
+            src = os.path.join(SOURCE_DIR, filename)
+            dst = os.path.join(ARCHIVE_DIR, filename)
+            shutil.move(src, dst)
+        print(f"📁 {len(processed_files)} Dateien verschoben nach _imported.")
+
+    except Exception as e:
+        print(f"❌ Fehler beim Senden: {e}")
diff --git a/scripts/archiv/import_textfile.py b/scripts/archiv/import_textfile.py
new file mode 100644
index 0000000..a4b9700
--- /dev/null
+++ b/scripts/archiv/import_textfile.py
@@ -0,0 +1,61 @@
+import requests
+import os
+import sys
+
+# 📌 Konfiguration
+API_URL = "http://localhost:8000/embed"
+CHUNK_SIZE = 500
+OVERLAP = 100
+
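+# Hinweis: chunk_text unten arbeitet als Sliding Window. Die Schrittweite beträgt
+# CHUNK_SIZE - OVERLAP = 400 Zeichen; Chunk i beginnt also bei Offset i * 400,
+# und zwei benachbarte Chunks teilen sich jeweils OVERLAP = 100 Zeichen Kontext.
+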
+def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
+    """
+    Teilt einen Text in überlappende Abschnitte auf.
+    """
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = min(start + size, len(text))
+        chunks.append(text[start:end])
+        start += size - overlap
+    return chunks
+
+def read_text_file(path):
+    """
+    Liest den Inhalt einer Textdatei ein.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        return f.read()
+
+def import_chunks(chunks, collection):
+    """
+    Sendet die Textabschnitte an die API.
+    """
+    response = requests.post(API_URL, json={
+        "texts": chunks,
+        "collection": collection
+    })
+    response.raise_for_status()
+    return response.json()
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("❌ Nutzung: python import_textfile.py <collection> <dateipfad>")
+        sys.exit(1)
+
+    collection = sys.argv[1]
+    filepath = os.path.expanduser(sys.argv[2])
+
+    if not os.path.isfile(filepath):
+        print(f"❌ Datei nicht gefunden: {filepath}")
+        sys.exit(1)
+
+    print(f"📄 Lade Datei: {filepath}")
+    text = read_text_file(filepath)
+    chunks = chunk_text(text)
+
+    print(f"📦 {len(chunks)} Abschnitte vorbereitet – sende an Collection '{collection}'...")
+    try:
+        result = import_chunks(chunks, collection)
+        print(f"✅ Import erfolgreich: {result}")
+    except Exception as e:
+        print(f"❌ Fehler beim Import: {e}")
diff --git a/scripts/archiv/import_texts.py b/scripts/archiv/import_texts.py
new file mode 100644
index 0000000..9469fbc
--- /dev/null
+++ b/scripts/archiv/import_texts.py
@@ -0,0 +1,31 @@
+import requests
+import sys
+
+# 📌 Konfiguration
+API_URL = "http://localhost:8000/embed"
+
+def import_text(text, collection="default"):
+    """
+    Sendet einen einzelnen Textabschnitt an die Embed-API zur Indexierung.
+    """
+    response = requests.post(API_URL, json={
+        "texts": [text],
+        "collection": collection
+    })
+    response.raise_for_status()
+    return response.json()
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("❌ Nutzung: python import_texts.py <collection> \"<text>\"")
+        sys.exit(1)
+
+    collection = sys.argv[1]
+    text = sys.argv[2]
+
+    print(f"📤 Sende an Collection '{collection}': {text}")
+    try:
+        result = import_text(text, collection)
+        print(f"✅ Antwort: {result}")
+    except Exception as e:
+        print(f"❌ Fehler beim Importieren: {e}")
diff --git a/scripts/archiv/index_documents.py b/scripts/archiv/index_documents.py
new file mode 100644
index 0000000..6af9369
--- /dev/null
+++ b/scripts/archiv/index_documents.py
@@ -0,0 +1,66 @@
+import os
+import sys
+import requests
+
+# 📌 Konfiguration
+API_URL = "http://localhost:8000/embed"
+CHUNK_SIZE = 500
+OVERLAP = 100
+
+# 📁 Kommandozeilenparameter auswerten
+if len(sys.argv) != 2:
+    print("❌ Bitte gib eine Kategorie an, z. B.: python index_documents.py karatetrainer")
+    sys.exit(1)
+
+CATEGORY = sys.argv[1]
+SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
+COLLECTION = CATEGORY
+
+if not os.path.exists(SOURCE_DIR):
+    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
+    sys.exit(1)
+
+print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
+print(f"🎯 Ziel-Collection: {COLLECTION}")
+
+# 🔧 Text in überlappende Chunks aufteilen
+def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = min(start + size, len(text))
+        chunks.append(text[start:end])
+        start += size - overlap
+    return chunks
+
+# 📚 Alle .txt-Dateien im Ordner lesen
+def read_all_text_files(folder):
+    chunks = []
+    for filename in os.listdir(folder):
+        if filename.endswith(".txt"):
+            path = os.path.join(folder, filename)
+            with open(path, "r", encoding="utf-8") as f:
+                text = f.read()
+            file_chunks = chunk_text(text)
+            chunks.extend(file_chunks)
+    return chunks
+
+# 📤 An API senden
+def embed_chunks(chunks, collection):
+    response = requests.post(API_URL, json={"texts": chunks, "collection": collection})
+    response.raise_for_status()
+    return response.json()
+
+# 🚀 Hauptlogik
+if __name__ == "__main__":
+    texts = read_all_text_files(SOURCE_DIR)
+    if not texts:
+        print("⚠️ Keine Textabschnitte gefunden.")
+        sys.exit(0)
+
+    print(f"📦 {len(texts)} Textabschnitte gefunden, sende an {API_URL}...")
+    try:
+        result = embed_chunks(texts, COLLECTION)
+        print(f"✅ Ergebnis: {result}")
+    except Exception as e:
+        print(f"❌ Fehler beim Senden: {e}")
diff --git a/scripts/archiv/text_chunker.py b/scripts/archiv/text_chunker.py
new file mode 100644
index 0000000..1a84379
--- /dev/null
+++ b/scripts/archiv/text_chunker.py
@@ -0,0 +1,46 @@
+import re
+
+def chunk_text_paragraphs(text, max_length=500, overlap=1):
+    """
+    Zerteilt den Text absatzweise in Chunks bis `max_length` Zeichen.
+    Optional wird `overlap` Absatz(e) vom vorherigen Chunk übernommen.
+    """
+
+    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
+    chunks = []
+    current_chunk = []
+    current_len = 0
+
+    i = 0
+    while i < len(paragraphs):
+        para = paragraphs[i]
+        para_len = len(para)
+
+        # Wenn dieser Absatz den max Chunk sprengt → neuen Chunk
+        if current_len + para_len + 2 > max_length:  # +2 für Leerzeile
+            if current_chunk:
+                chunks.append("\n\n".join(current_chunk))
+                # Optional: letzte N Absätze behalten
+                if overlap > 0:
+                    current_chunk = current_chunk[-overlap:]
+                    current_len = sum(len(p) for p in current_chunk) + 2 * len(current_chunk)
+                else:
+                    current_chunk = []
+                    current_len = 0
+            else:
+                # Einzelner Absatz ist zu groß → hart splitten
+                chunks.append(para[:max_length])
+                para = para[max_length:]
+                paragraphs.insert(i + 1, para)  # Rest zurück in Liste
+                i += 1
+                continue
+        else:
+            current_chunk.append(para)
+            current_len += para_len + 2  # +2 für Trennung
+
+        i += 1
+
+    if current_chunk:
+        chunks.append("\n\n".join(current_chunk))
+
+    return chunks
diff --git a/scripts/chunking_utils.py b/scripts/chunking_utils.py
new file mode 100644
index 0000000..7dcaaad
--- /dev/null
+++ b/scripts/chunking_utils.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# --------------------------------------------------
+# chunking_utils.py
+#
+# Enthält robuste Text-Chunking-Logik:
+#  1. Absatzbasiertes Chunking
+#  2. Satzbasiertes Chunking per Regex (kein NLTK)
+#  3. Satz-Overlap-Chunking
+# --------------------------------------------------
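+# Beispiel (illustrativ):
+#   chunk_with_sentence_overlap("Satz eins. Satz zwei. Satz drei.",
+#                               max_length=25, overlap_sents=1)
+#   -> ["Satz eins. Satz zwei.", "Satz zwei. Satz drei."]
+# --------------------------------------------------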
+
+import re
+
+# --------------------------------------------------
+# Hilfsfunktion: split_sentences
+# Zweck:
+#  - Teilt Text in Sätze auf, basierend auf Punkt, Ausrufe- und Fragezeichen
+#  - Trennt bei ".!? " (Satzzeichen gefolgt von Leerraum)
+# Parameter:
+#  text : Volltext als String
+# Rückgabe:
+#  Liste von Satz-Strings
+# --------------------------------------------------
+def split_sentences(text: str) -> list[str]:
+    # Regex: Lookbehind für . ! oder ?, dann ein oder mehrere Whitespace-Zeichen
+    return re.split(r'(?<=[\.!?])\s+', text.strip())
+
+
+# --------------------------------------------------
+# Funktion: chunk_text_paragraphs
+# Zweck:
+#  - Trennt Text absatzweise in Chunks mit bis zu max_length Zeichen
+#  - Absätze werden an doppelten Zeilenumbrüchen getrennt
+#  - Zu große Absätze werden hart in max_length-Teile gesplittet
+# Parameter:
+#  text       : Volltext als String
+#  max_length : Maximale Länge eines Chunks (Standard 500)
+# Rückgabe:
+#  Liste von Strings (Chunks)
+# --------------------------------------------------
+def chunk_text_paragraphs(text: str, max_length: int = 500) -> list[str]:
+    paragraphs = re.split(r'\n\s*\n', text.strip())
+    chunks: list[str] = []
+    current_chunk = ""
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        # Prüfen, ob Absatz noch in aktuellen Chunk passt (+2 für "\n\n")
+        if len(current_chunk) + len(para) + 2 <= max_length:
+            if current_chunk:
+                current_chunk += "\n\n" + para
+            else:
+                current_chunk = para
+        else:
+            # Bislang gesammelten Chunk speichern
+            if current_chunk:
+                chunks.append(current_chunk)
+
+            # Absatz hart splitten, wenn er allein zu groß ist
+            if len(para) > max_length:
+                for i in range(0, len(para), max_length):
+                    part = para[i:i + max_length]
+                    chunks.append(part)
+                current_chunk = ""
+            else:
+                # Neuer Chunk beginnt mit diesem Absatz
+                current_chunk = para
+
+    # Letzten Chunk nicht vergessen
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+
+# --------------------------------------------------
+# Funktion: chunk_by_sentences
+# Zweck:
+#  - Zerlegt Text in Sätze per Regex-Split
+#  - Baut daraus Chunks mit ganzen Sätzen bis max_length
+# Parameter:
+#  text       : Volltext als String
+#  max_length : Maximale Länge eines Chunks (Standard 500)
+# Rückgabe:
+#  Liste von Strings (Chunks)
+# --------------------------------------------------
+def chunk_by_sentences(text: str, max_length: int = 500) -> list[str]:
+    sentences = split_sentences(text)
+    chunks: list[str] = []
+    current_chunk = ""
+
+    for sent in sentences:
+        sent = sent.strip()
+        if not sent:
+            continue
+
+        # Prüfen, ob Satz noch in aktuellen Chunk passt (+1 für Leerzeichen)
+        if len(current_chunk) + len(sent) + 1 <= max_length:
+            if current_chunk:
+                current_chunk += " " + sent
+            else:
+                current_chunk = sent
+        else:
+            # Bisher gesammelten Chunk speichern
+            if current_chunk:
+                chunks.append(current_chunk)
+
+            # Einzelnen Satz hart splitten, falls er zu lang ist
+            if len(sent) > max_length:
+                for i in range(0, len(sent), max_length):
+                    chunks.append(sent[i:i + max_length])
+                current_chunk = ""
+            else:
+                current_chunk = sent
+
+    # Letzten Chunk nicht vergessen
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
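+# Beispiel (illustrativ):
+#   chunk_by_sentences("Erster Satz. Zweiter Satz. Dritter Satz.", max_length=30)
+#   -> ["Erster Satz. Zweiter Satz.", "Dritter Satz."]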
+
+# --------------------------------------------------
+# Funktion: chunk_with_sentence_overlap
+# Zweck:
+#  - Baut zunächst sentence-basierte Chunks
+#  - Fügt vom vorherigen Chunk overlap_sents Sätze vorne an
+# Parameter:
+#  text          : Volltext als String
+#  max_length    : Maximale Länge eines Chunks (Standard 500)
+#  overlap_sents : Anzahl Sätze, die überlappend übernommen werden (Standard 1)
+# Rückgabe:
+#  Liste von Strings (Chunks mit Kontext-Overlap)
+# --------------------------------------------------
+def chunk_with_sentence_overlap(
+    text: str,
+    max_length: int = 500,
+    overlap_sents: int = 1
+) -> list[str]:
+    base_chunks = chunk_by_sentences(text, max_length)
+    overlapped: list[str] = []
+
+    for idx, chunk in enumerate(base_chunks):
+        if idx == 0 or overlap_sents <= 0:
+            overlapped.append(chunk)
+        else:
+            prev = base_chunks[idx - 1]
+            prev_sents = split_sentences(prev)
+            context = " ".join(prev_sents[-overlap_sents:])
+            overlapped.append((context + " " + chunk).strip())
+
+    return overlapped
diff --git a/scripts/import_folder_txt.py b/scripts/import_folder_txt.py
new file mode 100644
index 0000000..241da99
--- /dev/null
+++ b/scripts/import_folder_txt.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
+import sys
+import subprocess
+from tqdm import tqdm
+
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(
+        description="Importiert alle .txt-Dateien in einer Kategorie mittels import_single_file.py"
+    )
+    parser.add_argument(
+        "category",
+        help="Name der Kategorie (Ordner unter ~/knowledge/)"
+    )
+    parser.add_argument(
+        "--topic", "-t",
+        help="Optionaler Kontext/Topic für alle Importe",
+        default="default"
+    )
+    parser.add_argument(
+        "--script-path", "-s",
+        help="Pfad zum import_single_file.py",
+        default="import_single_file.py"
+    )
+    args = parser.parse_args()
+
+    category = args.category
+    topic = args.topic
+    script_path = args.script_path
+
+    base_dir = os.path.expanduser(f"~/knowledge/{category}")
+    if not os.path.isdir(base_dir):
+        print(f"❌ Kategorie-Ordner nicht gefunden: {base_dir}")
+        sys.exit(1)
+
+    files = [f for f in os.listdir(base_dir) if f.endswith(".txt")]
+    if not files:
+        print(f"⚠️ Keine .txt-Dateien in {base_dir} gefunden.")
+        sys.exit(0)
+
+    print(f"📁 Starte Ordner-Import: {len(files)} Dateien in Kategorie '{category}' (Topic: '{topic}')")
+    for filename in tqdm(files, desc="Importiere Dateien"):
+        cmd = [
+            sys.executable,
+            script_path,
+            category,
+            filename,
+            topic
+        ]
+        try:
+            subprocess.run(cmd, check=True)
+        except subprocess.CalledProcessError as e:
+            print(f"❌ Fehler beim Import von {filename}: {e}")
+
+    print("✅ Ordner-Import abgeschlossen.")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/scripts/import_pdf_file.py b/scripts/import_pdf_file.py
new file mode 100644
index 0000000..d8f3f48
--- /dev/null
+++ b/scripts/import_pdf_file.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+import_pdf_file.py
+
+Importiert eine einzelne PDF-Datei:
+- Text-Extraktion via pdfplumber
+- Chunking mit Satz-Overlap
+- POST an /embed
+- Verschieben nach _imported
+"""
+
+import os
+import sys
+import shutil
+import requests
+from uuid import uuid4
+from datetime import datetime, timezone
+
+import pdfplumber
+from chunking_utils import chunk_with_sentence_overlap
+
+# Konfiguration
+API_URL = "http://localhost:8000/embed"
+MAX_LENGTH = 500
+
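+# Beispielaufruf (illustrativ; Dateiname und Topic sind nur Platzhalter):
+#   python import_pdf_file.py karatetrainer beispiel_uebung.pdf kihon
+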
+if len(sys.argv) < 3:
+    print("❌ Usage: python import_pdf_file.py <kategorie> <datei.pdf> [topic]")
+    sys.exit(1)
+
+category = sys.argv[1]
+filename = sys.argv[2]
+topic = sys.argv[3] if len(sys.argv) > 3 else "default"
+
+source_dir = os.path.expanduser(f"~/knowledge/{category}")
+file_path = os.path.join(source_dir, filename)
+archive_dir = os.path.join(source_dir, "_imported")
+
+if not os.path.isfile(file_path):
+    print(f"❌ Datei nicht gefunden: {file_path}")
+    sys.exit(1)
+
+os.makedirs(archive_dir, exist_ok=True)
+print(f"📄 PDF-Import: {filename} in Kategorie {category} (Topic={topic})")
+
+# 1) PDF-Text extrahieren
+text = ""
+with pdfplumber.open(file_path) as pdf:
+    for page in pdf.pages:
+        page_text = page.extract_text() or ""
+        text += page_text + "\n\n"
+
+# 2) Chunking
+chunks = chunk_with_sentence_overlap(
+    text,
+    max_length=MAX_LENGTH,
+    overlap_sents=2
+)
+print(f"📦 {len(chunks)} Chunks erzeugt.")
+
+# 3) Payload bauen
+now = datetime.now(timezone.utc).isoformat()
+payload = {"collection": category, "chunks": []}
+
+for idx, chunk in enumerate(chunks):
+    payload["chunks"].append({
+        "text": chunk,
+        "source": filename,
+        "source_type": "pdf",
+        "title": os.path.splitext(filename)[0],
+        "version": "v1.0",
+        "related_to": category,
+        "tags": [category],
+        "owner": "karate-agent",
+        "context_tag": topic,
+        "imported_at": now,
+        "chunk_index": idx,
+        "category": category
+    })
+
+# 4) Senden
+try:
+    res = requests.post(API_URL, json=payload)
+    res.raise_for_status()
+    print(f"✅ {len(chunks)} Chunks erfolgreich eingebettet.")
+except Exception as e:
+    print(f"❌ Fehler beim Senden: {e}")
+    sys.exit(1)
+
+# 5) Archivieren
+shutil.move(file_path, os.path.join(archive_dir, filename))
+print("📁 PDF verschoben nach _imported/")
diff --git a/scripts/import_single_file.py b/scripts/import_single_file.py
new file mode 100644
index 0000000..d021462
--- /dev/null
+++ b/scripts/import_single_file.py
@@ -0,0 +1,85 @@
+import os
+import sys
+import shutil
+import requests
+from datetime import datetime, timezone
+#from text_chunker import chunk_text_paragraphs
+from uuid import uuid4
+from chunking_utils import (
+    chunk_text_paragraphs,
+    chunk_by_sentences,
+    chunk_with_sentence_overlap
+)
+
+
+# 📌 Konfiguration
+API_URL = "http://localhost:8000/embed"
+CHUNK_SIZE = 500
+OVERLAP = 100
+
+# 📁 Eingabeparameter: Kategorie, Dateiname, optionale Metadaten
+if len(sys.argv) < 3:
+    print("❌ Aufruf: python import_single_file.py <kategorie> <dateiname> [topic]")
+    sys.exit(1)
+
+CATEGORY = sys.argv[1]
+FILENAME = sys.argv[2]
+TOPIC = sys.argv[3] if len(sys.argv) > 3 else None
+
+SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
+ARCHIVE_DIR = os.path.join(SOURCE_DIR, "_imported")
+FILEPATH = os.path.join(SOURCE_DIR, FILENAME)
+
+# ✅ Validierung
+if not os.path.exists(FILEPATH):
+    print(f"❌ Datei nicht gefunden: {FILEPATH}")
+    sys.exit(1)
+
+os.makedirs(ARCHIVE_DIR, exist_ok=True)
+
+print(f"📄 Importiere Datei: {FILENAME} aus Kategorie: {CATEGORY}")
+
+# 📚 Inhalte lesen und in Chunks zerteilen
+with open(FILEPATH, "r", encoding="utf-8") as f:
+    content = f.read()
+
+chunks = chunk_with_sentence_overlap(content, max_length=CHUNK_SIZE, overlap_sents=2)
+print(f"📦 {len(chunks)} Textabschnitte erzeugt.")
+
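+# Hinweis: overlap_sents=2 stellt jedem Folge-Chunk die letzten zwei Sätze des
+# vorherigen Chunks voran, damit beim Embedding kein Satzkontext an der
+# Chunk-Grenze verloren geht.
+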
+# 🧾 Metadaten vorbereiten
+now = datetime.now(timezone.utc).isoformat()
+
+
+payload = {
+    "chunks": [],
+    "collection": CATEGORY
+}
+
+for i, chunk in enumerate(chunks):
+    payload["chunks"].append({
+        "text": chunk,
+        "source": FILENAME,
+        "source_type": "file",
+        "title": FILENAME.replace(".txt", ""),
+        "version": "v1.0",
+        "related_to": CATEGORY,
+        "tags": [CATEGORY],
+        "owner": "karate-agent",
+        "context_tag": TOPIC or "default",
+        "imported_at": now,
+        "chunk_index": i,
+        "category": CATEGORY
+    })
+
+# 📤 An API senden
+try:
+    res = requests.post(API_URL, json=payload)
+    res.raise_for_status()
+    print(f"✅ {len(chunks)} Abschnitte erfolgreich eingebettet.")
+except Exception as e:
+    print(f"❌ Fehler beim Senden: {e}")
+    sys.exit(1)
+
+# 🗃️ Datei archivieren
+shutil.move(FILEPATH, os.path.join(ARCHIVE_DIR, FILENAME))
+print("📁 Datei nach _imported verschoben.")
diff --git a/scripts/import_txtdocuments.py b/scripts/import_txtdocuments.py
new file mode 100644
index 0000000..46a9daf
--- /dev/null
+++ b/scripts/import_txtdocuments.py
@@ -0,0 +1,135 @@
+import os
+import sys
+import shutil
+import requests
+import re
+from tqdm import tqdm
+from datetime import datetime
+
+# 📌 Konfiguration
+API_URL = "http://localhost:8000/embed"
+CHUNK_SIZE = 500
+OVERLAP = 100
+BATCH_SIZE = 20
+
+# 📁 Kommandozeilenparameter auswerten
+if len(sys.argv) != 2:
+    print("❌ Bitte gib eine Kategorie an, z. B.: python import_txtdocuments.py karatetrainer")
+    sys.exit(1)
+
+CATEGORY = sys.argv[1]
+SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
+ARCHIVE_DIR = os.path.join(SOURCE_DIR, "_imported")
+COLLECTION = CATEGORY
+
+if not os.path.exists(SOURCE_DIR):
+    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
+    sys.exit(1)
+
+os.makedirs(ARCHIVE_DIR, exist_ok=True)
+
+print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
+print(f"📂 Archivierte Dateien: {ARCHIVE_DIR}")
+print(f"🎯 Ziel-Collection: {COLLECTION}")
+
+# 🔧 Text in überlappende Chunks aufteilen (alte Variante, ersetzt durch chunk_text_paragraphs)
+#def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
+#    chunks = []
+#    start = 0
+#    while start < len(text):
+#        end = min(start + size, len(text))
+#        chunks.append(text[start:end])
+#        start += size - overlap
+#    return chunks
+
+def chunk_text_paragraphs(text, max_length=500):
+    paragraphs = re.split(r'\n\s*\n', text.strip())  # Absatztrennung
+    chunks = []
+    current_chunk = ""
+
+    for para in paragraphs:
+        if len(current_chunk) + len(para) + 2 <= max_length:
+            current_chunk += para + "\n\n"
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            # Falls einzelner Absatz zu groß ist, hart splitten
+            if len(para) > max_length:
+                for i in range(0, len(para), max_length):
+                    chunks.append(para[i:i+max_length].strip())
+                current_chunk = ""
+            else:
+                current_chunk = para + "\n\n"
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
+
+
+# 📚 Alle .txt-Dateien im Ordner lesen
+def read_all_text_files(folder):
+    file_chunk_map = {}  # Map: filename → chunks
+    for filename in os.listdir(folder):
+        if filename.endswith(".txt"):
+            path = os.path.join(folder, filename)
+            with open(path, "r", encoding="utf-8") as f:
+                text = f.read()
+            file_chunk_map[filename] = chunk_text_paragraphs(text)
+    return file_chunk_map
+
+# 🧱 Strukturierte Payloads vorbereiten
+def prepare_payloads(file_chunk_map, collection):
+    payloads = []
+    imported_at = datetime.now().isoformat()
+
+    for filename, chunks in file_chunk_map.items():
+        for local_index, chunk in enumerate(chunks):
+            payload = {
+                "text": chunk,
+                "source": filename,
+                "type": "file",
"category": collection, + "imported_at": imported_at, + "chunk_index": local_index + } + payloads.append(payload) + for p in payloads: + print(f"{p['source']}: chunk_index={p['chunk_index']}") + + return payloads + +# 📤 An API senden +def embed_chunks_in_batches(payloads, collection): + results = [] + for i in tqdm(range(0, len(payloads), BATCH_SIZE), desc="📡 Embedding"): + batch = payloads[i:i + BATCH_SIZE] + response = requests.post(API_URL, json={"chunks": batch, "collection": collection}) + response.raise_for_status() + results.append(response.json()) + return results + +# 🚀 Hauptlogik +if __name__ == "__main__": + file_chunk_map = read_all_text_files(SOURCE_DIR) + processed_files = list(file_chunk_map.keys()) + payloads = prepare_payloads(file_chunk_map, COLLECTION) + + if not payloads: + print("⚠️ Keine Textabschnitte gefunden.") + sys.exit(0) + + print(f"📦 {len(payloads)} Textabschnitte aus {len(processed_files)} Dateien gefunden. Sende an API...") + + try: + result = embed_chunks_in_batches(payloads, COLLECTION) + print(f"\n✅ Embedding abgeschlossen: {len(result)} API-Antwort(en) erhalten.") + + # 🗃️ Verarbeitete Dateien archivieren + for filename in processed_files: + src = os.path.join(SOURCE_DIR, filename) + dst = os.path.join(ARCHIVE_DIR, filename) + shutil.move(src, dst) + print(f"📁 {len(processed_files)} Dateien verschoben nach _imported.") + + except Exception as e: + print(f"❌ Fehler beim Senden: {e}") diff --git a/scripts/prompt_documents.py b/scripts/prompt_documents.py new file mode 100644 index 0000000..43a8883 --- /dev/null +++ b/scripts/prompt_documents.py @@ -0,0 +1,32 @@ +import sys +import requests + +API_URL = "http://localhost:8000/prompt" + +if len(sys.argv) < 3: + print("❌ Verwendung: python prompt_documents.py ") + sys.exit(1) + +collection = sys.argv[1] +query = " ".join(sys.argv[2:]) + +data = { + "query": query, + "collection": collection, + "context_limit": 3 +} + +print(f"🤖 Anfrage an LLM aus Collection '{collection}': {query}") +try: + response = requests.post(API_URL, json=data) + response.raise_for_status() + result = response.json() +except Exception as e: + print(f"❌ Fehler bei der Anfrage: {e}") + sys.exit(1) + +print("\n📄 Kontext:") +print(result["context"]) + +print("\n💡 Antwort:") +print(result["answer"]) diff --git a/scripts/restore_imported_files.py b/scripts/restore_imported_files.py new file mode 100644 index 0000000..1a6342a --- /dev/null +++ b/scripts/restore_imported_files.py @@ -0,0 +1,45 @@ +import os +import sys +import shutil + +def print_usage(): + print("❌ Bitte gib eine Kategorie an, z. B.:") + print(" python restore_imported_files.py karatetrainer") + sys.exit(1) + +# --- Eingabe prüfen --- +if len(sys.argv) < 2: + print_usage() + +CATEGORY = sys.argv[1] +FORCE = "--force" in sys.argv + +SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}/_imported") +TARGET_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}") + +if not os.path.isdir(SOURCE_DIR): + print(f"❌ Quellordner '{SOURCE_DIR}' existiert nicht.") + sys.exit(1) + +files = [f for f in os.listdir(SOURCE_DIR) if os.path.isfile(os.path.join(SOURCE_DIR, f))] +if not files: + print("⚠️ Keine Dateien zum Wiederherstellen gefunden.") + sys.exit(0) + +print(f"♻️ Wiederherstellung von {len(files)} Dateien nach '{TARGET_DIR}'") + +restored = 0 +for file in files: + source_path = os.path.join(SOURCE_DIR, file) + target_path = os.path.join(TARGET_DIR, file) + + if not FORCE: + confirm = input(f"🔁 Datei '{file}' zurückkopieren? 
[j/N] ").strip().lower() + if confirm != "j": + continue + + shutil.move(source_path, target_path) + print(f"✅ '{file}' wurde zurückkopiert.") + restored += 1 + +print(f"\n🎉 {restored} Datei(en) wurden erfolgreich wiederhergestellt.") diff --git a/scripts/restore_single_file.py b/scripts/restore_single_file.py new file mode 100644 index 0000000..7218385 --- /dev/null +++ b/scripts/restore_single_file.py @@ -0,0 +1,36 @@ +import os +import sys +import shutil + +def print_usage(): + print("❌ Bitte gib Kategorie und Dateinamen an, z. B.:") + print(" python restore_single_file.py karatetrainer mae_geri.txt") + sys.exit(1) + +# Eingabe prüfen +if len(sys.argv) != 3: + print_usage() + +CATEGORY = sys.argv[1] +FILENAME = sys.argv[2] + +SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}/_imported") +TARGET_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}") +SOURCE_FILE = os.path.join(SOURCE_DIR, FILENAME) +TARGET_FILE = os.path.join(TARGET_DIR, FILENAME) + +if not os.path.isfile(SOURCE_FILE): + print(f"❌ Die Datei '{FILENAME}' wurde im Archivordner '{SOURCE_DIR}' nicht gefunden.") + sys.exit(1) + +if os.path.exists(TARGET_FILE): + confirm = input(f"⚠️ Datei '{FILENAME}' existiert im Zielordner. Überschreiben? [j/N] ").strip().lower() + if confirm != "j": + print("⏹️ Abgebrochen.") + sys.exit(0) + +try: + shutil.move(SOURCE_FILE, TARGET_FILE) + print(f"✅ '{FILENAME}' wurde zurück nach '{TARGET_DIR}' verschoben.") +except Exception as e: + print(f"❌ Fehler beim Verschieben: {e}") diff --git a/scripts/search_documents.py b/scripts/search_documents.py new file mode 100644 index 0000000..30af22c --- /dev/null +++ b/scripts/search_documents.py @@ -0,0 +1,31 @@ +import sys +import requests + +API_URL = "http://localhost:8000/search" + +if len(sys.argv) < 3: + print("❌ Verwendung: python search_documents.py ") + sys.exit(1) + +collection = sys.argv[1] +query = " ".join(sys.argv[2:]) + +params = { + "query": query, + "collection": collection, + "limit": 5 +} + +print(f"🔍 Suche in Collection '{collection}': {query}") +try: + response = requests.get(API_URL, params=params) + response.raise_for_status() + results = response.json() +except Exception as e: + print(f"❌ Fehler bei der Anfrage: {e}") + sys.exit(1) + +print("📚 Ergebnisse:") +for r in results: + print(f"\n✅ Score: {r['score']:.4f}") + print(f"{r['text']}") diff --git a/scripts/wiki_importer.py b/scripts/wiki_importer.py new file mode 100644 index 0000000..7b81ec8 --- /dev/null +++ b/scripts/wiki_importer.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Module: wiki_importer.py +Beschreibung: +- Importiert Übungen aus dem MediaWiki via FastAPI wiki_router +- Führt vor dem Import einen Login gegen /import/wiki/login durch (falls nicht via --skip-login deaktiviert) +- Holt Liste aller Übungs-Titel (SMW-Ask) via `/semantic/pages` +- Für jede Übung: + * Fetch pageinfo (pageid, fullurl) via `/info` + * Parse Wikitext (Templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment) via `/parsepage` + * Baut Payload entsprechend Exercise-Datenmodell + * POST an `/exercise` Endpoint (exercise_router) +- Unterstützt Single-Import via `--title` (oder ENV `WIKI_EXERCISE_TITLE`) und Full-Import via `--all` +- Optional: Credentials via CLI (--username/--password) oder `.env` (WIKI_BOT_USER / WIKI_BOT_PASSWORD) + +Version: 2.1.0 +""" + +import os +import sys +import argparse +from typing import Dict, Any +import requests +import mwparserfromhell +from dotenv import load_dotenv + +# ----- Konfiguration / Defaults ----- 
+load_dotenv() # .env laden, falls vorhanden + +API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") # FastAPI-Wiki-Proxy +EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint +DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen") +DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") + +# ---- Hilfsfunktionen für Wiki-Router ---- +def wiki_health() -> None: + r = requests.get(f"{API_BASE_URL}/health", timeout=15) + r.raise_for_status() + print("[Sanity] Wiki health OK") + +def wiki_login(username: str, password: str) -> None: + """ + Führt einen Login gegen den wiki_router durch. + Erwartet: {"status":"success"} bei Erfolg. + """ + payload = {"username": username, "password": password} + r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30) + # kein raise_for_status(), wir wollen die JSON-Fehler sauber ausgeben + try: + data = r.json() + except Exception: + print(f"[Login] HTTP {r.status_code}: {r.text}") + r.raise_for_status() + + status = (data or {}).get("status") + if status != "success": + msg = (data or {}).get("message", "Login fehlgeschlagen") + raise RuntimeError(f"[Login] {msg}") + print("[Login] success") + +def fetch_all_pages(category: str) -> Dict[str, Any]: + resp = requests.get(f"{API_BASE_URL}/semantic/pages", params={"category": category}, timeout=60) + resp.raise_for_status() + return resp.json() + +def fetch_page_info(title: str) -> Dict[str, Any]: + r = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30) + r.raise_for_status() + info = r.json() + return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")} + +def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: + print(f"[Parse] Lade '{title}' (ID={pageid})") + resp = requests.get( + f"{API_BASE_URL}/parsepage", + params={"pageid": pageid, "title": title}, + timeout=60 + ) + resp.raise_for_status() + wikitext = resp.json().get("wikitext", "") + wikicode = mwparserfromhell.parse(wikitext) + + raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid} + for tpl in wikicode.filter_templates(): + name = str(tpl.name).strip() + if name == "ÜbungInfoBox": + for p in tpl.params: + raw[str(p.name).strip()] = str(p.value).strip() + elif name == "Übungsbeschreibung": + for p in tpl.params: + raw[str(p.name).strip()] = str(p.value).strip() + elif name == "SkillDevelopment": + raw.setdefault("capabilities", []) + try: + cap = str(tpl.get("PrimaryCapability").value).strip() + except Exception: + cap = "" + try: + lvl = int(str(tpl.get("CapabilityLevel").value).strip()) + except Exception: + lvl = 0 + if cap: + raw["capabilities"].append({"capability": cap, "level": lvl}) + raw["wikitext"] = wikitext + return raw + +def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]: + # Exercise.capabilities erwartet Dict[str,int] + caps_list = raw.get("capabilities", []) + capabilities = {} + for c in caps_list: + cap = c.get("capability") + lvl = c.get("level") + if isinstance(cap, str) and cap: + try: + capabilities[cap] = int(lvl) + except Exception: + pass + + # Defaults/Fallbacks + duration = 0.0 + try: + duration = float(raw.get("Dauer", 0) or 0) + except Exception: + duration = 0.0 + + keywords = [] + kw_raw = raw.get("Schlüsselworte", "") + if isinstance(kw_raw, str): + keywords = [k.strip() for k in kw_raw.split(",") if k.strip()] + + equipment = [] + eq_raw = raw.get("equipment", []) + if isinstance(eq_raw, str): + equipment = [e.strip() for e in 
+
+    payload: Dict[str, Any] = {
+        "title": raw.get("title") or "",
+        "summary": raw.get("Summary", "") or "",
+        "short_description": raw.get("Summary", "") or "",
+        "keywords": keywords,
+        "link": fullurl or "",
+        "discipline": raw.get("Übungstyp", "") or "",
+        "group": raw.get("Gruppengröße", "") or None,
+        "age_group": raw.get("Altersgruppe", "") or "",
+        "target_group": raw.get("Zielgruppe", "") or "",
+        "min_participants": 1,
+        "duration_minutes": int(round(duration)),  # Exercise expects an int
+        "capabilities": capabilities,
+        "category": category or "",
+        "purpose": raw.get("Ziel", "") or "",
+        "execution": raw.get("Durchführung", "") or "",
+        "notes": raw.get("Hinweise", "") or "",
+        "preparation": raw.get("RefMethode", "") or "",
+        "method": raw.get("method", "") or "",  # if present in the wikitext
+        "equipment": equipment,
+        "fullurl": fullurl or "",  # optional field
+        # Idempotency key (optionally used by exercise_router):
+        "external_id": f"wiki:{raw.get('pageid')}",
+        "source": "MediaWiki"
+    }
+    return payload
+
+def ingest_exercise(payload: Dict[str, Any]) -> None:
+    title = payload.get("title", "")
+    resp = requests.post(EXERCISE_API, json=payload, timeout=60)
+    if resp.status_code == 422:
+        # log validation errors and skip this exercise instead of aborting the run
+        print(f"[Ingest] '{title}' -> FAILED 422:\n{resp.text}")
+        return
+    resp.raise_for_status()
+    print(f"[Ingest] '{title}' -> OK")
+
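+# Example invocations (hypothetical credentials; the service is assumed on localhost:8000):
+#   python scripts/wiki_importer.py --title "Affenklatschen"
+#   python scripts/wiki_importer.py --all --category "Übungen" --username Bot --password secret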
+# ----- Main -----
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Import exercises from Wiki to Qdrant (via FastAPI wiki_router)")
+    parser.add_argument("--all", action="store_true", help="Import all exercises (SMW ask)")
+    parser.add_argument("--title", type=str, default=DEFAULT_TITLE, help="Import a single exercise title")
+    parser.add_argument("--category", type=str, default=DEFAULT_CAT, help="Wiki category (e.g. 'Übungen')")
+    parser.add_argument("--username", type=str, default=os.getenv("WIKI_BOT_USER"), help="Wiki login user (overrides .env)")
+    parser.add_argument("--password", type=str, default=os.getenv("WIKI_BOT_PASSWORD"), help="Wiki login password (overrides .env)")
+    parser.add_argument("--skip-login", action="store_true", help="Skip the login step (if a session is already active)")
+
+    args = parser.parse_args()
+
+    # Sanity check
+    wiki_health()
+
+    # Login (unless explicitly skipped)
+    if not args.skip_login:
+        if not args.username or not args.password:
+            print("[Login] Error: missing credentials. Set .env (WIKI_BOT_USER/WIKI_BOT_PASSWORD) or pass --username/--password.", file=sys.stderr)
+            sys.exit(1)
+        try:
+            wiki_login(args.username, args.password)
+        except Exception as e:
+            print(str(e), file=sys.stderr)
+            sys.exit(1)
+
+    # Single or full import
+    if args.all:
+        print(f"[Main] Loading exercise list from category '{args.category}'…")
+        pages = fetch_all_pages(args.category)
+        print(f"[Main] {len(pages)} pages found.")
+        for title, entry in pages.items():
+            pid = entry.get("pageid")
+            fullurl = entry.get("fullurl")
+            if not pid:
+                # fall back to the core API for page info
+                info = fetch_page_info(title)
+                pid = info.get("pageid")
+                fullurl = fullurl or info.get("fullurl")
+            if not pid:
+                print(f"[Skip] '{title}' has no pageid")
+                continue
+            raw = parse_exercise(title, pid)
+            payload = build_payload(raw, fullurl or "", args.category)
+            ingest_exercise(payload)
+    else:
+        print(f"[Main] Import single exercise: {args.title}")
+        info = fetch_page_info(args.title)
+        pid = info.get("pageid")
+        fullurl = info.get("fullurl") or ""
+        if not pid:
+            print(f"[Error] pageid for '{args.title}' not found.", file=sys.stderr)
+            sys.exit(1)
+        raw = parse_exercise(args.title, pid)
+        payload = build_payload(raw, fullurl, args.category)
+        ingest_exercise(payload)
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/wiki_importer1.1.0.py b/scripts/wiki_importer1.1.0.py
new file mode 100644
index 0000000..06661ed
--- /dev/null
+++ b/scripts/wiki_importer1.1.0.py
@@ -0,0 +1,117 @@
+"""
+Module: wiki_importer.py
+Description:
+- Imports just a single exercise from the wiki for now
+- Reads the wikitext of one exercise
+- Parses it with mwparserfromhell
+- Extracts fields from the templates:
+  * ÜbungInfoBox
+  * Übungsbeschreibung
+  * SkillDevelopment (repeatable)
+- Assembles an Exercise object
+- Stores it in Qdrant via POST to the /exercise endpoint
+- Detailed error logging for 422 and general failures
+Version: 1.1.2
+"""
+import requests
+import mwparserfromhell
+import os
+import sys
+from typing import Dict, Any
+
+# Configuration via environment variables
+API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
+EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
+# Title of the exercise to import
+TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
+
+# Helper: fetches pageid and fullurl via the core API
+def fetch_page_info(title: str) -> Dict[str, Any]:
+    r = requests.get(f"{API_BASE_URL}/info", params={"title": title})
+    r.raise_for_status()
+    info = r.json()
+    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}
+
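+# Hypothetical wikitext this importer expects (template and field names are from the real wiki):
+#   {{ÜbungInfoBox|Dauer=10|Schlüsselworte=Aufwärmen, Reaktion}}
+#   {{SkillDevelopment|PrimaryCapability=Reaktion|CapabilityLevel=3}}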
+# Parser: loads and parses one exercise
+def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
+    print(f"[Parse] Loading '{title}' (ID={pageid})")
+    resp = requests.get(f"{API_BASE_URL}/parsepage", params={"pageid": pageid, "title": title})
+    resp.raise_for_status()
+    wikitext = resp.json().get("wikitext", "")
+
+    wikicode = mwparserfromhell.parse(wikitext)
+    data: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
+
+    for tpl in wikicode.filter_templates():
+        name = tpl.name.strip()
+        if name == "ÜbungInfoBox":
+            for param in tpl.params:
+                data[param.name.strip()] = str(param.value).strip()
+        elif name == "Übungsbeschreibung":
+            for param in tpl.params:
+                data[param.name.strip()] = str(param.value).strip()
+        elif name == "SkillDevelopment":
+            data.setdefault("capabilities", [])
+            primary = str(tpl.get("PrimaryCapability").value).strip()
+            level = int(str(tpl.get("CapabilityLevel").value).strip())
+            data["capabilities"].append({"capability": primary, "level": level})
+    data["wikitext"] = wikitext
+    return data
+
+# Ingestion: sends one record to Qdrant with detailed error logging
+def ingest_exercise(ex_data: Dict[str, Any]) -> None:
+    title = ex_data.get("title")
+    try:
+        resp = requests.post(EXERCISE_API, json=ex_data)
+        if resp.status_code == 422:
+            print(f"[Ingest] '{title}' -> FAILED 422:")
+            print(resp.text)
+        resp.raise_for_status()
+        print(f"[Ingest] '{title}' -> OK")
+    except requests.HTTPError as e:
+        msg = resp.text if 'resp' in locals() else str(e)
+        print(f"[Ingest] '{title}' -> HTTPError: {e} - {msg}")
+    except Exception as e:
+        print(f"[Ingest] '{title}' -> FAILED: {e}")
+
+# Main: one-off import of TITLE
+if __name__ == "__main__":
+    print(f"[Main] Import single exercise: {TITLE}")
+    try:
+        info = fetch_page_info(TITLE)
+        pageid = info.get("pageid")
+        fullurl = info.get("fullurl")
+        if not pageid:
+            print(f"Error: pageid for '{TITLE}' not found.")
+            sys.exit(1)
+        raw = parse_exercise(TITLE, pageid)
+        # convert capabilities into a dict
+        caps_list = raw.get("capabilities", [])
+        capabilities = {c["capability"]: c["level"] for c in caps_list}
+        # payload matching the data model
+        exercise_payload = {
+            "title": raw.get("title"),
+            "summary": raw.get("Summary", ""),
+            "short_description": raw.get("Summary", ""),
+            "keywords": raw.get("Schlüsselworte", "").split(', '),
+            "link": fullurl,
+            "discipline": raw.get("Übungstyp", ""),
+            "group": raw.get("Gruppengröße", ""),
+            "age_group": raw.get("Altersgruppe", ""),
+            "target_group": raw.get("Zielgruppe", ""),
+            "min_participants": 1,
+            "duration_minutes": float(raw.get("Dauer", 0)),
+            "capabilities": capabilities,
+            "category": raw.get("category", "Übungen"),
+            "purpose": raw.get("Ziel", ""),
+            "execution": raw.get("Durchführung", ""),
+            "notes": raw.get("Hinweise", ""),
+            "preparation": raw.get("RefMethode", ""),
+            "method": raw.get("method", ""),
+            "equipment": raw.get("equipment", []),
+            "fullurl": fullurl
+        }
+        ingest_exercise(exercise_payload)
+    except Exception as e:
+        print(f"Fatal error: {e}")
+        sys.exit(1)
diff --git a/scripts/wiki_importer2.0.0.py b/scripts/wiki_importer2.0.0.py
new file mode 100644
index 0000000..22747fd
--- /dev/null
+++ b/scripts/wiki_importer2.0.0.py
@@ -0,0 +1,136 @@
+"""
+Module: wiki_importer.py
+Description:
+- Imports all exercises from the wiki
+- Fetches the list of all exercise titles and pageids via `/semantic/pages`
+- For each exercise:
+  * fetch page info (pageid, fullurl)
+  * parse the wikitext (templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment)
+  * build the payload according to the data model
+  * POST it to the `/exercise` endpoint
+- Supports single import via the environment variable `WIKI_EXERCISE_TITLE` and full import via `--all`
+Version: 2.0.0
+"""
+import requests
+import mwparserfromhell
+import os
+import sys
+from typing import Dict, Any, List
+import argparse
+
+# Configuration
+API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
+EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
+DEFAULT_CATEGORY = os.getenv("WIKI_CATEGORY", "Übungen")
+DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
+
+# Helper: fetches the list of all exercises (title -> entry)
+def fetch_all_pages(category: str) -> Dict[str, Any]:
+    resp = requests.get(f"{API_BASE_URL}/semantic/pages", params={"category": category})
+    resp.raise_for_status()
+    return resp.json()
+
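+# Hypothetical `/semantic/pages` response shape (inferred from how pages.items() is used below):
+#   {"Affenklatschen": {"pageid": 42, "fullurl": "https://wiki.example.org/wiki/Affenklatschen"}, ...}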
+# Helper: fetches pageid and fullurl via the core API
+def fetch_page_info(title: str) -> Dict[str, Any]:
+    r = requests.get(f"{API_BASE_URL}/info", params={"title": title})
+    r.raise_for_status()
+    info = r.json()
+    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}
+
+# Parser: loads and parses one exercise
+def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
+    print(f"[Parse] {title} (ID={pageid})")
+    resp = requests.get(f"{API_BASE_URL}/parsepage", params={"pageid": pageid, "title": title})
+    resp.raise_for_status()
+    wikitext = resp.json().get("wikitext", "")
+    wikicode = mwparserfromhell.parse(wikitext)
+
+    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
+    for tpl in wikicode.filter_templates():
+        name = tpl.name.strip()
+        if name == "ÜbungInfoBox":
+            for p in tpl.params:
+                raw[p.name.strip()] = str(p.value).strip()
+        elif name == "Übungsbeschreibung":
+            for p in tpl.params:
+                raw[p.name.strip()] = str(p.value).strip()
+        elif name == "SkillDevelopment":
+            raw.setdefault("capabilities", [])
+            cap = str(tpl.get("PrimaryCapability").value).strip()
+            lvl = int(str(tpl.get("CapabilityLevel").value).strip())
+            raw["capabilities"].append({"capability": cap, "level": lvl})
+    raw["wikitext"] = wikitext
+    return raw
+
+# Ingestion
+def ingest_exercise(payload: Dict[str, Any]) -> None:
+    title = payload.get("title")
+    resp = requests.post(EXERCISE_API, json=payload)
+    if resp.status_code == 422:
+        print(f"[Error] {title} -> 422: {resp.text}")
+        return
+    resp.raise_for_status()
+    print(f"[Ingest] {title} -> OK")
+
+# Build payload
+def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]:
+    caps_list = raw.get("capabilities", [])
+    capabilities = {c["capability"]: c["level"] for c in caps_list}
+    return {
+        "title": raw.get("title"),
+        "summary": raw.get("Summary", ""),
+        "short_description": raw.get("Summary", ""),
+        "keywords": raw.get("Schlüsselworte", "").split(', '),
+        "link": fullurl,
+        "discipline": raw.get("Übungstyp", ""),
+        "group": raw.get("Gruppengröße", ""),
+        "age_group": raw.get("Altersgruppe", ""),
+        "target_group": raw.get("Zielgruppe", ""),
+        "min_participants": 1,
+        "duration_minutes": float(raw.get("Dauer", 0)),
+        "capabilities": capabilities,
+        "category": category,
+        "purpose": raw.get("Ziel", ""),
+        "execution": raw.get("Durchführung", ""),
+        "notes": raw.get("Hinweise", ""),
+        "preparation": raw.get("RefMethode", ""),
+        "method": raw.get("method", ""),
+        "equipment": raw.get("equipment", []),
+        "fullurl": fullurl
+    }
+
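+# Hypothetical capability mapping: SkillDevelopment entries such as
+# [{"capability": "Reaktion", "level": 3}] become {"Reaktion": 3} in the payload.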
+# Main
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Import exercises from Wiki to Qdrant")
+    parser.add_argument("--all", action="store_true", help="Import all exercises")
+    parser.add_argument("--title", type=str, default=DEFAULT_TITLE, help="Single exercise title")
+    parser.add_argument("--category", type=str, default=DEFAULT_CATEGORY, help="Wiki category")
+    args = parser.parse_args()
+
+    if args.all:
+        pages = fetch_all_pages(args.category)
+        print(f"Found {len(pages)} exercises in category '{args.category}'")
+        for title, entry in pages.items():
+            pid = entry.get("pageid")
+            if not pid:
+                info = fetch_page_info(title)
+                pid = info.get("pageid")
+                fullurl = info.get("fullurl")
+            else:
+                fullurl = entry.get("fullurl") or fetch_page_info(title)["fullurl"]
+            if not pid:
+                print(f"Skip {title}, no pageid")
+                continue
+            raw = parse_exercise(title, pid)
+            payload = build_payload(raw, fullurl, args.category)
+            ingest_exercise(payload)
+    else:
+        info = fetch_page_info(args.title)
+        pid = info.get("pageid")
+        fullurl = info.get("fullurl")
+        if not pid:
+            print(f"Error: pageid for '{args.title}' not found")
+            sys.exit(1)
+        raw = parse_exercise(args.title, pid)
+        payload = build_payload(raw, fullurl, args.category)
+        ingest_exercise(payload)
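+
+# Example run (assumes the FastAPI app and Qdrant are reachable on localhost:8000):
+#   python scripts/wiki_importer2.0.0.py --all --category "Übungen"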