"""Index the text files of one knowledge category through the embedding API.

Usage:
    python index_documents.py <category>

Every ``.txt`` file in ``~/knowledge/<category>`` is read, split into
overlapping chunks and posted in a single request to ``API_URL``; the
category name doubles as the name of the target collection.
"""
import os
import sys

# Configuration
API_URL = "http://localhost:8000/embed"
CHUNK_SIZE = 500  # maximum number of characters per chunk
OVERLAP = 100  # characters shared by two consecutive chunks
# (connect, read) timeout in seconds; without one a stalled server would
# block the script forever.
REQUEST_TIMEOUT = (10, 600)


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping chunks.

    Args:
        text: The string to split.
        size: Maximum length of one chunk, in characters. Must be positive.
        overlap: Number of characters each chunk shares with its
            predecessor. Must satisfy ``0 <= overlap < size``.

    Returns:
        A list of chunks in document order; empty when *text* is empty.

    Raises:
        ValueError: If *size* or *overlap* is out of range. An overlap of
            *size* or more would never advance through the text.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must be in the range [0, size), got {overlap} for size {size}"
        )

    step = size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = start + size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already reaches the end of the text. Advancing again
            # would only emit a tail that is fully contained in it.
            break
        start += step
    return chunks


def read_all_text_files(folder):
    """Read every ``.txt`` file directly inside *folder* and chunk its text.

    Files are processed in sorted name order so repeated runs produce the
    chunks in the same order (``os.listdir`` itself guarantees no order).
    Directory entries that end in ``.txt`` but are not regular files are
    skipped.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A flat list with the chunks of all files.

    Raises:
        OSError: If the folder or one of the files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            chunks.extend(chunk_text(f.read()))
    return chunks


def embed_chunks(chunks, collection, timeout=REQUEST_TIMEOUT):
    """Send *chunks* to the embedding API and return its decoded JSON reply.

    Args:
        chunks: List of text chunks to embed.
        collection: Name of the target collection.
        timeout: Timeout passed to ``requests.post``: a number of seconds or
            a ``(connect, read)`` tuple.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx status code.
        ValueError: If the response body is not valid JSON.
    """
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines where the third-party ``requests`` package is missing.
    import requests

    response = requests.post(
        API_URL,
        json={"texts": chunks, "collection": collection},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


def main(argv=None):
    """Run the indexing workflow and return the process exit status.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        ``0`` on success or when there is nothing to index, ``1`` on a usage
        error, a missing source folder or a failed upload.
    """
    argv = sys.argv if argv is None else argv

    # Evaluate the command-line parameters.
    if len(argv) != 2:
        print("❌ Bitte gib eine Kategorie an, z.\u202fB.: python index_documents.py karatetrainer")
        return 1

    category = argv[1]
    source_dir = os.path.expanduser(f"~/knowledge/{category}")
    collection = category

    # isdir rather than exists: a regular file with that name is not usable.
    if not os.path.isdir(source_dir):
        print(f"❌ Der Ordner '{source_dir}' existiert nicht.")
        return 1

    print(f"📁 Lese Dokumente aus: {source_dir}")
    print(f"🎯 Ziel-Collection: {collection}")

    texts = read_all_text_files(source_dir)
    if not texts:
        print("⚠️ Keine Textabschnitte gefunden.")
        return 0

    print(f"📦 {len(texts)} Textabschnitte gefunden, sende an {API_URL}...")
    try:
        result = embed_chunks(texts, collection)
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # exit status instead of ending with a traceback.
        print(f"❌ Fehler beim Senden: {e}")
        return 1

    print(f"✅ Ergebnis: {result}")
    return 0


if __name__ == "__main__":
    sys.exit(main())