import os
import sys
import requests

# 📌 Configuration
# Endpoint that embed_chunks sends the chunk batch to via HTTP POST.
API_URL = "http://localhost:8000/embed"
# Defaults for chunk_text: OVERLAP is subtracted from CHUNK_SIZE to get the
# distance between the start offsets of two consecutive chunks.
OVERLAP = 100
CHUNK_SIZE = 500

# 📁 Parse command-line arguments
# Exactly one argument is expected: the category name. It selects both the
# source folder (~/knowledge/<category>) and the target collection.
if len(sys.argv) != 2:
    print("❌ Bitte gib eine Kategorie an, z. B.: python index_documents.py karatetrainer")
    sys.exit(1)

CATEGORY = sys.argv[1]
SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
COLLECTION = CATEGORY

# isdir instead of exists: a regular file at this path would pass an
# existence check and only fail later, when the folder is listed.
if not os.path.isdir(SOURCE_DIR):
    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
    sys.exit(1)

print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
print(f"🎯 Ziel-Collection: {COLLECTION}")

# 🔧 Split text into overlapping chunks
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into consecutive slices that overlap their neighbours.

    Args:
        text: The string to split.
        size: Maximum length of each chunk; must be positive.
        overlap: How many trailing characters of a chunk are repeated at the
            start of the next one; must be smaller than *size*.

    Returns:
        A list of chunks in text order; empty when *text* is empty.

    Raises:
        ValueError: If *size* is not positive or *overlap* is not smaller
            than *size* (the window would never advance).
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap >= size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        # Once a chunk reaches the end of the text, any further window would
        # lie entirely inside this chunk's overlap region, so stop here.
        if start + size >= len(text):
            break
    return chunks

# 📚 Read all .txt files in the folder
def read_all_text_files(folder):
    """Read every ``.txt`` file directly inside *folder* and chunk its text.

    Entries are processed in sorted name order so the resulting chunk list is
    reproducible between runs (``os.listdir`` returns names in arbitrary
    order). Entries whose name ends in ``.txt`` but which are not files, such
    as directories, are skipped instead of crashing ``open``.

    Args:
        folder: Path of the directory to scan; subdirectories are not entered.

    Returns:
        A flat list holding the chunks of all files, as produced by
        ``chunk_text`` with its default settings.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        if not filename.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        # Chunk after the file is closed; the handle is no longer needed.
        chunks.extend(chunk_text(text))
    return chunks

# 📤 Send to the API
def embed_chunks(chunks, collection, timeout=(10, 600)):
    """POST the chunks to ``API_URL`` and return the decoded JSON reply.

    Args:
        chunks: List of text chunks, sent under the ``"texts"`` key.
        collection: Collection name, sent under the ``"collection"`` key.
        timeout: Forwarded to ``requests.post``; either a single number of
            seconds or a ``(connect, read)`` pair. Without a timeout the call
            can wait forever on an unresponsive server. The default read
            limit is generous because all chunks go out in one request.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.post(
        API_URL,
        json={"texts": chunks, "collection": collection},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

# 🚀 Main logic
# Collect the chunks of all text files, send them to the API in one request
# and report the outcome. Exit status: 0 on success or when there is nothing
# to send, 1 when the upload fails.
if __name__ == "__main__":
    texts = read_all_text_files(SOURCE_DIR)
    if not texts:
        print("⚠️ Keine Textabschnitte gefunden.")
        sys.exit(0)

    print(f"📦 {len(texts)} Textabschnitte gefunden, sende an {API_URL}...")
    try:
        result = embed_chunks(texts, COLLECTION)
    except Exception as e:
        # Top-level boundary: show a short message instead of a traceback,
        # but exit non-zero so shells and schedulers can detect the failure.
        print(f"❌ Fehler beim Senden: {e}")
        sys.exit(1)
    print(f"✅ Ergebnis: {result}")